ó
    fT–hœ ã                   ó~  • S r SSKrSSKrSSKrSSKJr  SSKJrJrJ	r	  SSK
r
SSKr
SSK
JrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJr  \R<                  " \5      r S;S jr!S r"S;S jr#S r$\ " S S\5      5       r%\ " S S\5      5       r&\ " S S\5      5       r'\ " S S\5      5       r(\ " S S\5      5       r) " S S\RT                  5      r+ " S S \RX                  5      r- " S! S"\RX                  5      r. " S# S$\RX                  5      r/ " S% S&\RX                  5      r0 " S' S(\RX                  5      r1\" S)S*9 " S+ S,\)5      5       r2\" S-S*9 " S. S/\)5      5       r3\ " S0 S1\)5      5       r4\" S2S*9 " S3 S4\)\5      5       r5\" S5S*9 " S6 S7\)\5      5       r6 " S8 S9\)5      r7/ S:Qr8g)<zRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).é    N)Ú	dataclass)ÚOptionalÚTupleÚUnion)ÚTensorÚnn)Ú	LayerNormé   )ÚACT2FN)ÚGenerationMixin)ÚBaseModelOutput)ÚPreTrainedModel)ÚModelOutputÚauto_docstringÚloggingé   )ÚProphetNetConfigc                 óÀ   • U(       a,  [         R                  R                  U R                  5       US9$ [         R                  R                  X[        R
                  S9$ )N©Údim©r   Údtype)r   Ú
functionalÚsoftmaxÚfloatÚtorchÚfloat32)Úhidden_stater   Ú
onnx_traces      Új/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   r   '   sF   € ÞÜ}‰}×$Ñ$ \×%7Ñ%7Ó%9¸sÐ$ÐCÐCä}‰}×$Ñ$ \Ä%Ç-Á-Ð$ÐPÐPó    c                 óh  • [         R                  " XU 4X#S9[         R                  " U5      R                  -  nUR	                  5       R                  5       n[        U5       H,  nXV   R                  SSS9  XF   R                  U* S-   5        M.     SUSS2SS2S4'   [         R                  " XE/SS9$ )	z8
This function computes the bias for the predict stream
)Údevicer   r   F)Úwrapr   Né   r   )
r   ÚonesÚfinfoÚminÚdetachÚcloneÚrangeÚfill_diagonal_Útriu_Úcat)Úsequence_lengthÚngramr#   r   Ú
left_blockÚright_blockÚ
stream_idxs          r    Úngram_attention_biasr4   .   s®   € ô
 	
Š
E¨OÐ<ÀVÑYÔ\a×\gÒ\gÐhmÓ\n×\rÑ\rÑrð ð ×#Ñ#Ó%×+Ñ+Ó-€Kä˜E–lˆ
ØÑ×.Ñ.¨q°uÐ.Ñ=ØÑ×$Ñ$ j [°1¡_Ö5ñ #ð €JŠq’!QˆwÑÜ9Š9jÐ.°AÑ6Ð6r!   c                 ó¾  • U* nSnU(       a[  U S-  n U[         R                  " U[         R                  " U5      5      R                  5       U -  -   n[         R                  " U5      nO+[         R
                  " U[         R                  " U5      5      nU S-  n[         R                  " XF5      nU[         R                  " UR                  5       U-  5      [        R                  " X-  5      -  X-
  -  -   n[         R                  " U[         R                  " U5      U S-
  -  5      R                  5       nU[         R                  " XtR                  5       U5      -   nU$ )zg
This function computes individual parts of the relative position buckets. For more detail, see paper.
r   r%   r   )r   ÚltÚ
zeros_likeÚintÚabsÚmaxÚlogr   Úmathr(   Ú	ones_likeÚwhere)	Únum_bucketsÚmax_distanceÚrelative_positionsÚis_bidirectionalÚinv_relative_positionsÚrel_positions_bucketÚ	max_exactÚis_smallÚval_if_larges	            r    Úcompute_relative_bucketsrH   ?   s>  € ð 1Ð0ÐØÐæØ! QÑ&ˆà ÜhŠhÐ-¬u×/?Ò/?Ð@VÓ/WÓX×\Ñ\Ó^ÐalÑlñmð 	ô "'§¢Ð+AÓ!BÑä!&§¢Ð+AÄ5×CSÒCSÐTjÓCkÓ!lÐà˜qÑ €IÜxŠxÐ.Ó:€HØœuŸyšyÐ)?×)EÑ)EÓ)GÈ)Ñ)SÓTÔW[×W_ÒW_ØÑ óXñ  à	Ñ	 ñ "ñ "€Lô —9’9˜\¬5¯?ª?¸<Ó+HÈKÐZ[ÉOÑ+\Ó]×aÑaÓc€LØ/´%·+²+¸h×HbÑHbÓHdÐfrÓ2sÑsÐØÐr!   c                 óx  • UR                  S5      R                  SUR                  S5      S5      nX2R                  S5      -
  n[        R                  " US-
  U4SS9R                  S5      nUR                  SUR                  S5      S5      nXBR                  S5      -
  n[        XUSS9n[        XUSS9nXV4$ )ze
This function computes both main and predict relative position buckets. For more detail, see paper.
r   éÿÿÿÿr   F)rB   )Ú	unsqueezeÚrepeatÚsizer   r.   rH   )r?   r@   Úposition_idsÚmain_stream_relative_positionsÚ$predicting_stream_relative_positionsÚmain_relative_position_bucketsÚ!predict_relative_position_bucketss          r    Ú#compute_all_stream_relative_bucketsrS   Z   sá   € ð
 &2×%;Ñ%;¸AÓ%>×%EÑ%EÀaÈ×IZÑIZÐ[]ÓI^Ð`aÓ%bÐ"Ø%C×F\ÑF\Ð]_ÓF`Ñ%`Ð"ô ,1¯9ª9°lÀQÑ6FÈÐ5UÐ[]Ñ+^×+hÑ+hÐijÓ+kÐ(Ø+O×+VÑ+VÐWXÐZf×ZkÑZkÐlnÓZoÐqrÓ+sÐ(Ø+O×RhÑRhÐikÓRlÑ+lÐ(ô &>ØÐ#AÐTYñ&Ð"ô )AØÐ#GÐZ_ñ)Ð%ð *ÐLÐLr!   c                   ó:  • \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   \S 5       rSrg)ÚProphetNetSeq2SeqLMOutputéq   a6  
Base class for sequence-to-sequence language models outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, encoder_sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
        softmax, used to compute the weighted average in the self-attention heads.
NÚlossÚlogitsÚlogits_ngramÚpast_key_valuesÚdecoder_hidden_statesÚdecoder_ngram_hidden_statesÚdecoder_attentionsÚdecoder_ngram_attentionsÚcross_attentionsÚencoder_last_hidden_stateÚencoder_hidden_statesÚencoder_attentionsc                 óP   • [         R                  " S[        5        U R                  $ ©Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.©ÚwarningsÚwarnÚFutureWarningr_   ©Úselfs    r    Údecoder_cross_attentionsÚ2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions¼   ó$   € äŠðäô	
ð
 ×$Ñ$Ð$r!   © )Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__rW   r   r   ÚFloatTensorÚ__annotations__rX   rY   rZ   r   r[   r\   r]   r^   r_   r`   ra   rb   Úpropertyrk   Ú__static_attributes__rn   r!   r    rU   rU   q   sH  ‡ ñ:ðx )-€Dˆ(5×$Ñ$Ñ
%Ó,Ø*.€FˆHU×&Ñ&Ñ'Ó.Ø04€L(˜5×,Ñ,Ñ-Ó4Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØFJÐ ¨%°×0AÑ0AÑ*BÑ!CÓJØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAØCGÐ˜h u¨U×->Ñ->Ñ'?Ñ@ÓGØ;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø=AÐ˜x¨×(9Ñ(9Ñ:ÓAØ@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAàñ%ó ó%r!   rU   c                   ó  • \ rS rSr% Sr\R                  \S'   Sr\	\R                     \S'   Sr
\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S	'   Sr\	\\R                        \S
'   Sr\	\\R                        \S'   Sr\	\R                     \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   \S 5       rSrg)ÚProphetNetSeq2SeqModelOutputéÆ   aj  
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, encoder_sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, encoder_sequence_length)`.

        Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
Úlast_hidden_stateNÚlast_hidden_state_ngramrZ   r[   r\   r]   r^   r_   r`   ra   rb   c                 óP   • [         R                  " S[        5        U R                  $ rd   re   ri   s    r    rk   Ú5ProphetNetSeq2SeqModelOutput.decoder_cross_attentions  rm   r!   rn   )ro   rp   rq   rr   rs   r   rt   ru   r|   r   rZ   r   r[   r\   r]   r^   r_   r`   ra   rb   rv   rk   rw   rn   r!   r    ry   ry   Æ   s+  ‡ ñ<ð| ×(Ñ(Ó(Ø;?Ð˜X e×&7Ñ&7Ñ8Ó?Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØFJÐ ¨%°×0AÑ0AÑ*BÑ!CÓJØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAØCGÐ˜h u¨U×->Ñ->Ñ'?Ñ@ÓGØ;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø=AÐ˜x¨×(9Ñ(9Ñ:ÓAØ@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAàñ%ó ó%r!   ry   c                   ót  • \ rS rSr% Sr\R                  \S'   Sr\	\R                     \S'   Sr
\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S	'   Sr\	\\R                        \S
'   Sr\	\\R                        \S'   Srg)ÚProphetNetDecoderModelOutputi  aÂ  
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
r{   Nr|   rZ   Úhidden_statesÚhidden_states_ngramÚ
attentionsÚngram_attentionsr_   rn   )ro   rp   rq   rr   rs   r   rt   ru   r|   r   rZ   r   r   r‚   rƒ   r„   r_   rw   rn   r!   r    r€   r€     sË   ‡ ñ.ð` ×(Ñ(Ó(Ø;?Ð˜X e×&7Ñ&7Ñ8Ó?Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø8<€M8˜E %×"3Ñ"3Ñ4Ñ5Ó<Ø>BÐ˜ %¨×(9Ñ(9Ñ":Ñ;ÓBØ59€J˜˜u×0Ñ0Ñ1Ñ2Ó9Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ö?r!   r€   c                   ó¦  • \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)ÚProphetNetDecoderLMOutputiX  aÍ  
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
NrW   rX   rY   rZ   r   r‚   rƒ   r„   r_   rn   )ro   rp   rq   rr   rs   rW   r   r   rt   ru   rX   rY   rZ   r   r   r‚   rƒ   r„   r_   rw   rn   r!   r    r†   r†   X  sè   ‡ ñ/ðb )-€Dˆ(5×$Ñ$Ñ
%Ó,Ø*.€FˆHU×&Ñ&Ñ'Ó.Ø04€L(˜5×,Ñ,Ñ-Ó4Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø8<€M8˜E %×"3Ñ"3Ñ4Ñ5Ó<Ø>BÐ˜ %¨×(9Ñ(9Ñ":Ñ;ÓBØ59€J˜˜u×0Ñ0Ñ1Ñ2Ó9Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ö?r!   r†   c                   ó,   • \ rS rSr\rSrSrS rS r	Sr
g)ÚProphetNetPreTrainedModeli–  Ú
prophetnetTc                 óF  • [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         g g g )Nç        )ÚmeanÚstd)Ú
isinstancer   ÚLinearÚweightÚdataÚnormal_ÚconfigÚinit_stdÚbiasÚzero_Ú	EmbeddingÚpadding_idx)rj   Úmodules     r    Ú_init_weightsÚ'ProphetNetPreTrainedModel._init_weightsœ  sÎ   € ÜfœbŸi™i×(Ñ(ØM‰M×Ñ×&Ñ&¨C°T·[±[×5IÑ5IÐ&ÑJØ{‰{Ñ&Ø—‘× Ñ ×&Ñ&Õ(ð 'ä˜¤§¡×-Ñ-ØM‰M×Ñ×&Ñ&¨C°T·[±[×5IÑ5IÐ&ÑJØ×!Ñ!Ñ-Ø—‘×"Ñ" 6×#5Ñ#5Ñ6×<Ñ<Õ>ð .ð .r!   c                 ó  • U R                   R                  nU R                   R                  nUc   S5       eUR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc   S5       eUR                  US:H  U5        [        R                  " US:¬  5      R                  5       (       d   S	5       eU$ )
Nz™self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rJ   r   ).r   z1self.model.config.pad_token_id has to be defined.éœÿÿÿr   z8Verify that `shifted_input_ids` has only positive values)
r“   Údecoder_start_token_idÚpad_token_idÚ	new_zerosÚshaper*   Úmasked_fill_r   ÚallÚitem)rj   Ú	input_idsrž   rŸ   Úshifted_input_idss        r    Ú_shift_rightÚ&ProphetNetPreTrainedModel._shift_right¦  sØ   € Ø!%§¡×!CÑ!CÐØ—{‘{×/Ñ/ˆà%Ñ1ð 	
ðFó	
Ð1ð &×/Ñ/°	·±Ó@ÐØ%.¨s°C°R°C¨xÑ%8×%>Ñ%>Ó%@Ð˜#˜q™r˜'Ñ"Ø$:˜&Ñ!àÑ'Ð\Ð)\Ó\Ð'à×&Ñ&Ð'8¸DÑ'@À,ÔOäyŠyÐ*¨aÑ/Ó0×5Ñ5×7Ñ7ÐsÐ9sÓsÐ7à Ð r!   rn   N)ro   rp   rq   rr   r   Úconfig_classÚbase_model_prefixÚsupports_gradient_checkpointingrš   r§   rw   rn   r!   r    rˆ   rˆ   –  s   † à#€LØ$ÐØ&*Ð#ò?õ!r!   rˆ   c                   óT   ^ • \ rS rSrSrS\SS4U 4S jjrS
U 4S jjrU 4S jrS	r	U =r
$ )ÚProphetNetPositionalEmbeddingsi½  zý
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
the forward function.
r“   ÚreturnNc                 ó†   >• UR                   U l        [        TU ]  UR                   UR                  UR
                  5        g ©N)Úmax_position_embeddingsÚ
max_lengthÚsuperÚ__init__Úhidden_sizerŸ   ©rj   r“   Ú	__class__s     €r    r´   Ú'ProphetNetPositionalEmbeddings.__init__Ä  s3   ø€ Ø ×8Ñ8ˆŒÜ‰Ñ˜×7Ñ7¸×9KÑ9KÈV×M`ÑM`Õar!   c                 ó&  >• Ub  U R                   b   S5       eUcæ  Ub[  US   S   R                  S   nUS   U-   n[        R                  " S[        R                  US9[        U R                   U-   5      -  nOˆUc$  [        R                  " U[        R                  US9n[        R                  " USS9R                  U5      U-  R	                  5       U R                   -   nUR                  SU R                  S-
  5      n[        TU ]-  U5      U4$ )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r%   r   )r   r   ©r   r#   r   )r˜   r¡   r   r&   Úlongr8   ÚcumsumÚtype_asÚclampr²   r³   Úforward)	rj   Úinputs_shaper#   Úattention_maskrZ   rN   Úprev_num_input_idsÚnum_input_idsr·   s	           €r    r¿   Ú&ProphetNetPositionalEmbeddings.forwardÈ  s  ø€ ØÑ$¨$×*:Ñ*:Ñ*Bð 	
ØQó	
ÐCð ÑØÑ*ð &5°QÑ%7¸Ñ%:×%@Ñ%@ÀÑ%CÐ"Ø ,¨Q¡Ð2DÑ DÜ$Ÿzšz¨&¼¿
¹
È6ÑRÜ˜×(Ñ(¨=Ñ8Ó9ñ ‘ð "Ñ)Ü%*§Z¢Z°ÄEÇJÁJÐW]Ñ%^Nô —L’L °QÑ7×?Ñ?ÀÓOÐR`Ñ`ß‘$“&˜4×+Ñ+ñ ,ð
  ,×1Ñ1°!°T·_±_ÀqÑ5HÓIä‰w‰˜|Ó,¨lÐ:Ð:r!   c                 ó"   >• [         TU ]  U5      $ r°   )r³   r¿   )rj   rN   r·   s     €r    Ú_forwardÚ'ProphetNetPositionalEmbeddings._forwardä  s   ø€ Ü‰w‰˜|Ó,Ð,r!   )r²   )NNN)ro   rp   rq   rr   rs   r   r´   r¿   rÆ   rw   Ú__classcell__©r·   s   @r    r­   r­   ½  s.   ø† ñðbÐ/ð b°D÷ b÷;÷8-ó -r!   r­   c                   óÀ   ^ • \ rS rSrSrS\S\4U 4S jjrS\R                  S\S\4S	 jr
     SS
\\	   S\\	   S\\	   S\\\	      S\S\\	\\	   4   4S jjrSrU =r$ )ÚProphetNetAttentioniè  z=Multi-headed attention from 'Attention Is All You Need' paperr“   Únum_attn_headsc                 ó¨  >• [         TU ]  5         UR                  nUR                  U l        UR                  U l        X l        X2-  U l        U R                  U-  U:X  d   S5       e[        R                  " X35      U l	        [        R                  " X35      U l
        [        R                  " X35      U l        [        R                  " X35      U l        g )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r³   r´   rµ   Úattention_dropoutÚdropoutrÌ   Úhead_dimr   r   Úkey_projÚ
value_projÚ
query_projÚout_proj)rj   r“   rÌ   rµ   r·   s       €r    r´   ÚProphetNetAttention.__init__ë  s©   ø€ ô
 	‰ÑÔØ×(Ñ(ˆà!'×!9Ñ!9ˆÔØ—~‘~ˆŒØ,ÔØ#Ñ5ˆŒà}‰}˜~Ñ-°Ó<ð 	
ð4ó	
Ð<ô
 Ÿ	š	 +Ó;ˆŒÜŸ)š) KÓ=ˆŒÜŸ)š) KÓ=ˆŒäŸ	š	 +Ó;ˆr!   ÚtensorÚseq_lenÚbszc                 óŒ   • UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ ©Nr   r%   ©ÚviewrÌ   rÐ   Ú	transposeÚ
contiguous)rj   rÖ   r×   rØ   s       r    Ú_shapeÚProphetNetAttention._shape  s7   € Ø{‰{˜3¨×)<Ñ)<¸d¿m¹mÓL×VÑVÐWXÐZ[Ó\×gÑgÓiÐir!   Úkey_value_statesrÁ   Úlayer_head_maskÚpast_key_valueÚoutput_attentionsr®   c                 ó*  • UR                  5       u  pxn	US Ln
[        UR                  5       5      UUU	/:X  d   SXxU	4 SUR                  5        35       eU R                  U5      U R                  S-  -  nU
(       a  Ub  US   nUS   nOU
(       aE  U R	                  U R                  U5      SU5      nU R	                  U R                  U5      SU5      nODU R	                  U R                  U5      SU5      nU R	                  U R                  U5      SU5      nU
(       a  XÍ4nXpR                  SU R                  4nU R	                  X¸U5      R                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " SX¼R                  SS	5      5      nXpR                  X4nUR                  5       U:w  a  [        S
U SUR                  5        35      eUb  UR                  5       S:X  a  S nXpR                  SU4nUb3  UR                  5       U:w  a  [        SU SUR                  5        35      eUb  UU-   nU(       a  UnOS n[        R                  R!                  USS9nUb  UR                  5       U R                  4:X  d&   SU R                  4 SUR                  5        35       eUR                  SSSS5      UR                  XpR                  X5      -  nUR                  SSSS5      U-  n[        R                  R#                  UU R$                  U R&                  S9n[        R                  " SUU5      nXpR                  X€R                  4nUR                  5       U:w  a  [        SU SUR                  5        35      eUR                  SS5      R)                  XxU	5      nU R+                  U5      n[        R                  R#                  UU R"                  U R&                  S9nUUU4$ )Nz Size of hidden states should be ú	, but is ç      à?r   r   rJ   r%   zbsij,bsjk->bsikr
   z#Attention weights should have size z Attention mask should have size r   ú/Head mask for a single layer should be of size ©ÚpÚtrainingz `attn_output` should have shape ú, but is of shape )rM   ÚlistrÓ   rÐ   rß   rÑ   rÒ   rÌ   rÜ   r   ÚeinsumrÝ   Ú
ValueErrorr   r   r   r   rÏ   rÎ   rë   ÚreshaperÔ   )rj   r   rá   rÁ   râ   rã   rä   Ú
batch_sizeÚtgt_lenrµ   Úis_cross_attentionÚquery_statesÚ
key_statesÚvalue_statesÚ
proj_shapeÚsrc_lenÚattn_weightsÚexpected_shapeÚattn_weights_reshapedÚ
attn_probsÚattn_outputs                        r    r¿   ÚProphetNetAttention.forward  s8  € ð ,9×+=Ñ+=Ó+?Ñ(ˆ
˜[ð .°TÐ9ÐÜM×&Ñ&Ó(Ó)ØØØð.
ó 
ð 	pð .¨jÀ;Ð.NÐ-OÈyÐYf×YkÑYkÓYmÐXnÐoó		pð 
ð —‘ }Ó5¸¿¹ÈÑ9KÑLˆæ .Ñ"<à'¨Ñ*ˆJØ)¨!Ñ,‰LÞàŸ™ T§]¡]Ð3CÓ%DÀbÈ*ÓUˆJØŸ;™; t§¡Ð7GÓ'HÈ"ÈjÓY‰Lð Ÿ™ T§]¡]°=Ó%AÀ2ÀzÓRˆJØŸ;™; t§¡°}Ó'EÀrÈ:ÓVˆLæð
 )Ð7ˆNð !×"5Ñ"5°r¸4¿=¹=ÐIˆ
Ø—{‘{ <¸*ÓE×JÒJÈJÐWˆØ—_’_ jÐ1ˆ
Ø#×(Ò(¨*Ð5ˆØ—/‘/ !Ó$ˆÜ—|’|Ð$5°|×EYÑEYÐZ[Ð]^ÓE_Ó`ˆØ$×&9Ñ&9¸7ÐLˆØ×ÑÓ .Ó0ÜÐBÀ>ÐBRÐR[Ð\h×\mÑ\mÓ\oÐ[pÐqÓrÐrð Ñ%¨.×*<Ñ*<Ó*>À!Ó*CØ!ˆNà$×&9Ñ&9¸1¸gÐFˆØÑ%¨.×*=Ñ*=Ó*?À>Ó*QÜÐ?ÀÐ?OÈyÐYg×YlÑYlÓYnÐXoÐpÓqÐqØÑ%Ø'¨.Ñ8ˆLÞØ$0Ñ!à$(Ð!ä—}‘}×,Ñ,¨\¸rÐ,ÐBˆàÑ&Ø"×'Ñ'Ó)¨d×.AÑ.AÐ-CÓCð ØAÀ4×CVÑCVÐBXÐAYð ZØ#×(Ñ(Ó*Ð+ð-óÐCð +×/Ñ/°°2°q¸!Ó<¸|×?PÑ?PØ×/Ñ/°ó@ñ ˆLð
 %4×$8Ñ$8¸¸BÀÀ1Ó$EÐH]Ñ$]Ð!ä—]‘]×*Ñ*ØØ×$Ñ$Ø—]‘]ð +ð 
ˆ
ô
 —l’lÐ#4°jÀ,ÓOˆØ$×&9Ñ&9¸7ÇMÁMÐRˆØ×ÑÓ Ó/ÜÐ?ÀÐ?OÐOaÐbm×brÑbrÓbtÐauÐvÓwÐwà!×+Ñ+¨A¨qÓ1×9Ñ9¸*È{Ó[ˆØ—m‘m KÓ0ˆä—m‘m×+Ñ+¨K¸4¿<¹<ÐRV×R_ÑR_Ð+Ð`ˆØÐ1°>ÐAÐAr!   )rÎ   rÏ   rÐ   rÑ   rÌ   rÔ   rÓ   rÒ   )NNNNF)ro   rp   rq   rr   rs   r   r8   r´   r   r   rß   r   r   Úboolr¿   rw   rÈ   rÉ   s   @r    rË   rË   è  sÍ   ø† ÙGð<à ð<ð ÷<ð0j˜UŸ\™\ð j°Cð j¸cô jð .2Ø+/Ø,0Ø26Ø"'ñ`Bð # 6Ñ*ð`Bð ! Ñ(ð	`Bð
 " &Ñ)ð`Bð !  v¡Ñ/ð`Bð  ð`Bð 
ˆvx Ñ'Ð'Ñ	(÷`Bó `Br!   rË   c                   ó>   ^ • \ rS rSrSrS\S\4U 4S jjrS rSr	U =r
$ )ÚProphetNetFeedForwardii  ze
This is the residual two feed-forward layer block based on the original Transformer implementation.
r“   Úffn_dimc                 ó,  >• [         TU ]  5         [        UR                     U l        [
        R                  " UR                  U5      U l        [
        R                  " X!R                  5      U l	        UR                  U l
        UR                  U l        g r°   )r³   r´   r   Úactivation_functionÚactivation_fnr   r   rµ   ÚintermediateÚoutputÚactivation_dropoutrÏ   )rj   r“   r  r·   s      €r    r´   ÚProphetNetFeedForward.__init__n  si   ø€ Ü‰ÑÔÜ# F×$>Ñ$>Ñ?ˆÔÜŸIšI f×&8Ñ&8¸'ÓBˆÔÜ—i’i ×);Ñ);Ó<ˆŒØ"(×";Ñ";ˆÔØ—~‘~ˆr!   c                 ó4  • U R                  U5      nU R                  U5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      n[        R                  R	                  XR                  U R                  S9nU$ )Nré   )r  r  r   r   rÏ   r  rë   r  )rj   r   s     r    r¿   ÚProphetNetFeedForward.forwardv  s€   € Ø×)Ñ)¨-Ó8ˆØ×*Ñ*¨=Ó9ˆäŸ™×-Ñ-¨m×?VÑ?VÐae×anÑanÐ-ÐoˆØŸ™ MÓ2ˆÜŸ™×-Ñ-¨m¿|¹|ÐVZ×VcÑVcÐ-ÐdˆØÐr!   )r  r  rÏ   r  r  )ro   rp   rq   rr   rs   r   r8   r´   r¿   rw   rÈ   rÉ   s   @r    r  r  i  s&   ø† ñð&Ð/ð &¸#÷ &÷ð r!   r  c                   ót   ^ • \ rS rSrS\4U 4S jjrS rS r       SS\\	\
      4S jjrS rS	 rS
rU =r$ )ÚProphetNetNgramSelfAttentioni€  r“   c                 ó°  >• [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U R                  -  U l	        UR                  U l
        U R                  U R                  -  UR                  :X  d   S5       e[        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  U R                  U R                  -  5      U l        SU l        g )Nz6config.hidden_size must be divisible by num_attn_headsF)r³   r´   rµ   r?   Úrelative_max_distanceÚnum_decoder_attention_headsrÌ   rÏ   rÎ   rÐ   r0   r   r   rÑ   rÒ   rÓ   rÔ   Úrelative_pos_embeddingsr   r¶   s     €r    r´   Ú%ProphetNetNgramSelfAttention.__init__  sa  ø€ Ü‰ÑÔØ!×-Ñ-ˆÔà!×-Ñ-ˆÔØ%+×%AÑ%AˆÔ"Ø$×@Ñ@ˆÔØ—~‘~ˆŒØ!'×!9Ñ!9ˆÔØ×*Ñ*¨d×.AÑ.AÑAˆŒØ—\‘\ˆŒ
à}‰}˜t×2Ñ2Ñ2°f×6HÑ6HÓHð 	
ØDó	
ÐHô Ÿ	š	 &×"4Ñ"4°f×6HÑ6HÓIˆŒÜŸ)š) F×$6Ñ$6¸×8JÑ8JÓKˆŒÜŸ)š) F×$6Ñ$6¸×8JÑ8JÓKˆŒô Ÿ	š	 &×"4Ñ"4°f×6HÑ6HÓIˆŒô (*§y¢y°×1CÑ1CÀT×EUÑEUÐX\×XkÑXkÑEkÓ'lˆÔ$ð  ˆr!   c                 óŒ   • UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ rÚ   rÛ   )rj   rÖ   r×   rñ   s       r    rß   Ú#ProphetNetNgramSelfAttention._shapež  s7   € Ø{‰{˜:°×0CÑ0CÀTÇ]Á]ÓS×]Ñ]Ð^_ÐabÓc×nÑnÓpÐpr!   c                 ó   • SU l         g )NT)r   ri   s    r    Úprepare_for_onnx_export_Ú5ProphetNetNgramSelfAttention.prepare_for_onnx_export_¡  s	   € Øˆr!   rã   c	           	      óº  • UR                  5       u  pšn[        UR                  5       5      XšU/:X  d   SXšU4 SUR                   35       eU R                  U5      nU R	                  U5      nU R                  U5      nXÀR                  S-  -  nU R                  XÊU	5      nU R                  USU	5      nU R                  USU	5      nXR                  SU R                  4nUR                  " U6 nUR                  " U6 nUR                  " U6 nUR                  SU R                  -   SS9nUR                  SU R                  -   SS9nUR                  SU R                  -   SS9nUR                  SU R                  -   SS9nUS   USS  nnUS   USS  nnUS   USS  nnUS   USS  nnUb8  US   n[        R                  " UU4SS9nUS   n[        R                  " UU4SS9nUU4nU
SU R                  -   -  n[        R                  " S	UUR                  SS
5      5      nU R!                  UUX†5      n UU -   nUb  UU-   n[#        USU R$                  S9R'                  U5      n!Ubw  UR                  5       U R                  4:X  d&   SU R                  4 SUR                  5        35       eUR                  SSSS5      U!R                  XR                  SU5      -  n![(        R*                  R-                  U!U R.                  U R0                  S9n![        R                  " S	U!U5      n"U"R                  SS5      R3                  U	SUU5      n"U R5                  U"5      n"[        R6                  " US5      R                  XR                  U R                  UU R                  5      n#[        R6                  " U V$s/ s H  n$[        R                  " UU$/S5      PM     sn$S5      n%[        R6                  " USS9n&[        R                  " U V's/ s H+  n'[        R                  " UU'/S5      R9                  S5      PM-     sn'S5      n([        R                  " SU#U%45      n)U R;                  U&U)X‡5      n*U)U*-   n)Ub5  UR=                  SSSS
S5      nUR?                  U)R@                  5      nU)U-   n)[#        U)SU R$                  S9R'                  U)5      n+Ub]  UR                  5       U R                  4:X  d&   SU R                  4 SUR                  5        35       eUR                  SSSSS5      U+-  n+[(        R*                  R-                  U+U R.                  U R0                  S9n+[        R                  " SU+U(R                  SS5      45      n,U,R                  SS
5      n,U,R3                  XR                  UU5      n,U R5                  U,5      n,[        R                  " U"U,/S5      R                  U	SU5      n-U!R                  XR                  US5      n![(        R*                  R-                  U-U R,                  U R0                  S9n-U-U!U+U4$ s  sn$f s  sn'f )Nz#`hidden_states` should be of shape rì   rç   rJ   r   r   r%   r   zbntc,bncs->bntsr
   )r   r   rè   ræ   ré   zbnhtc,bnhsc->bnhtsé   zbnhts,bnhsc->bnhtc)!rM   rí   r¡   rÓ   rÑ   rÒ   rÐ   rß   rÌ   rÜ   Úchunkr0   r   r.   rî   rÝ   Ú get_main_relative_pos_embeddingsr   r   r½   r   r   rÏ   rÎ   rë   rð   rÔ   ÚstackrK   Ú#get_predict_relative_pos_embeddingsÚpermuteÚtor   ).rj   r   rã   rÁ   râ   Úextended_predict_attention_maskrQ   rR   rN   rñ   Úngram_sequence_lengthrµ   rô   rõ   rö   r÷   Úhidden_states_listÚquery_states_listÚkey_states_listÚvalue_states_listÚmain_hidden_statesÚhidden_states_predict_listÚmain_query_statesÚpredict_query_states_listÚmain_key_statesÚpredict_key_states_listÚmain_value_statesÚpredict_value_states_listÚprev_main_key_statesÚprev_main_value_statesr/   Úmain_attn_weightsÚmain_relative_pos_embeddingsÚmain_attn_probsÚmain_attn_outputÚpredict_query_statesÚkeyÚpredict_key_statesÚpredict_hidden_statesÚv_pÚpredict_value_statesÚpredict_attn_weightsÚpredict_relative_pos_embeddingsÚpredict_attn_probsÚpredict_attn_outputrý   s.                                                 r    r¿   Ú$ProphetNetNgramSelfAttention.forward¤  s  € ð :G×9KÑ9KÓ9MÑ6ˆ
¨;ÜM×&Ñ&Ó(Ó)¨jÐQ\Ð-]Ó]ð 	
Ø1°*ÐU`Ð2`Ð1að bØ×#Ñ#Ð$ð&ó	
Ð]ð —‘ }Ó5ˆØ—]‘] =Ó1ˆ
Ø—‘ }Ó5ˆð $§}¡}°cÑ'9Ñ:ˆð —{‘{ <È
ÓSˆØ—[‘[ ¨R°Ó<ˆ
Ø—{‘{ <°°ZÓ@ˆØ ×"5Ñ"5°r¸4¿=¹=ÐIˆ
à#×(Ò(¨*Ð5ˆØ—_’_ jÐ1ˆ
Ø#×(Ò(¨*Ð5ˆð +×0Ñ0°°T·Z±Z±ÀQÐ0ÐGÐØ(×.Ñ.¨q°4·:±:©~À1Ð.ÐEÐØ$×*Ñ*¨1¨t¯z©z©>¸qÐ*ÐAˆØ(×.Ñ.¨q°4·:±:©~À1Ð.ÐEÐà9KÈAÑ9NÐPbÐcdÐceÐPfÐ6ÐØ7HÈÑ7KÐM^Ð_`Ð_aÐMbÐ4ÐØ3BÀ1Ñ3EÀÐWXÐWYÐGZÐ0ˆØ7HÈÑ7KÐM^Ð_`Ð_aÐMbÐ4Ðð Ñ%Ø#1°!Ñ#4Ð Ü#ŸišiÐ)=¸Ð(OÐUVÑWˆOØ%3°AÑ%6Ð"Ü %§	¢	Ð+AÐCTÐ*UÐ[\Ñ ]Ðð *Ð+<Ð=ˆð 0°A¸¿
¹
±NÑCˆô "ŸLšLÐ):Ð<MÈ×OhÑOhÐijÐlmÓOnÓoÐð (,×'LÑ'LØÐ 1°<ó(
Ð$ð .Ð0LÑLÐàÑ%Ø 1°NÑ BÐä!ØØØ—‘ñ
÷ ‰'Ð#Ó
$ð	 	ð Ñ&Ø"×'Ñ'Ó)¨d×.AÑ.AÐ-CÓCð ØAÀ4×CVÑCVÐBXÐAYð ZØ#×(Ñ(Ó*Ð+ð-óÐCð .×2Ñ2°1°b¸!¸QÓ?À/×BVÑBVØ×/Ñ/°°_óCñ ˆOô Ÿ-™-×/Ñ/°À4×CYÑCYÐdh×dqÑdqÐ/Ðrˆô
 !Ÿ<š<Ð(9¸?ÐL]Ó^Ðà+×5Ñ5°a¸Ó;×CÑCÀJÐPQÐSbÐdoÓpÐØŸ=™=Ð)9Ó:Ðô  %Ÿ{š{Ð+DÀaÓH×MÑMØŸ
™
 D×$7Ñ$7¸È$Ï-É-ó 
Ðô
 #Ÿ[š[ÑZqÓ)rÒZqÐSV¬%¯)ª)°_ÀcÐ4JÈAÖ*NÑZqÑ)rÐtuÓvÐô !&§¢Ð,FÈAÑ NÐô  %ŸyšyÙLeÓfÒLeÀSŒUYŠYÐ)¨3Ð/°Ó3×=Ñ=¸aÖ@ÑLeÑfÐhió 
Ðô  %Ÿ|š|Ð,@ÐCWÐYkÐBlÓmÐð +/×*RÑ*RØ!Ð#7¸ó+
Ð'ð
  4Ð6UÑUÐà*Ñ6à.M×.UÑ.UÐVWÐYZÐ\]Ð_`ÐbcÓ.dÐ+Ø.M×.PÑ.PÐQe×QkÑQkÓ.lÐ+Ø#7Ð:YÑ#YÐ ä$Ø ØØ—‘ñ
÷ ‰'Ð&Ó
'ð	 	ð Ñ&Ø"×'Ñ'Ó)¨d×.AÑ.AÐ-CÓCð ØAÀ4×CVÑCVÐBXÐAYð ZØ#×(Ñ(Ó*Ð+ð-óÐCð "1×!5Ñ!5°a¸¸BÀÀ1Ó!EÐHZÑ!ZÐäŸ]™]×2Ñ2Ø $×"8Ñ"8À4Ç=Á=ð 3ð 
Ðô $ŸlšlØ Ð#5Ð7K×7UÑ7UÐVWÐYZÓ7[Ð"\ó
Ðð 2×;Ñ;¸A¸qÓAÐØ1×9Ñ9¸*ÇjÁjÐRaÐcnÓoÐØ"Ÿm™mÐ,?Ó@Ðô —i’iÐ!1Ð3FÐ GÈÓK×PÑPÐQ[Ð]_ÐalÓmˆà)×.Ñ.¨z×;NÑ;NÐP_ÐacÓdˆä—m‘m×+Ñ+¨K¸4¿<¹<ÐRV×R_ÑR_Ð+Ð`ˆà˜OÐ-?ÀÐOÐOùòI *sùò gs   Î<#YÐ2Yc                 óü  • UR                   u  pVpxUR                  XVXx5      nUcÄ  UR                   S S u  pY[        R                  " SUR                   S   S-   5      R	                  S5      R	                  S5      R                  XYS5      R                  UR                  5      n
X£R	                  S5      R                  XYS5      -
  n
[        U R                  U R                  U
S5      nU R                  U5      nUR                  UR                   S S U R                  U R                  4-   5      nUR                  SSSS5      nUR                  UR                   S S S-   5      nUR                  SU R                  S5      nUR                  SUR                   S   5      nUR                  5       nUR                  SUR!                  S5      5      n[        R"                  " USUS9nUR                  XVUS5      nU$ )	Nr%   r   rJ   r   Fr
   )rJ   ©r   Úindex)r¡   rÜ   r   ÚarangerK   rL   r  r#   rH   r?   r  r  rÌ   r  rð   r»   rM   Úgather)rj   r   rù   rN   rQ   rñ   rÌ   rò   rø   r/   rA   Úrel_pos_embeddingsr1  s                r    r  Ú=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsV  s  € ð 8D×7IÑ7IÑ4ˆ
 GØ#×(Ñ(¨ÀWÓVˆØ)Ñ1Ø*7×*=Ñ*=¸b¸qÐ*AÑ'ˆJä—’˜Q × 2Ñ 2°2Ñ 6¸Ñ :Ó;ß‘˜1“ß‘˜1“ß‘˜
°QÓ7ß‘L×'Ñ'Ó(ð ð "4×6LÑ6LÈQÓ6O×6VÑ6VÐWaÐtuÓ6vÑ!vÐÜ-EØ× Ñ  $×"<Ñ"<Ð>PÐRWó.Ð*ð
 "×9Ñ9¸-ÓHÐØ/×4Ñ4Ø×$Ñ$ R aÐ(¨D×,<Ñ,<¸d×>QÑ>QÐ+RÑRó
Ðð 0×7Ñ7¸¸1¸aÀÓCÐà/×7Ñ7¸×8JÑ8JÈ2ÈAÐ8NÐQVÑ8VÓWÐà)G×)NÑ)NÈqÐRV×ReÑReÐghÓ)iÐ&à)G×)LÑ)LØÐ.×4Ñ4°RÑ8ó*
Ð&ð *H×)LÑ)LÓ)NÐ&à/×7Ñ7¸Ð<N×<SÑ<SÐTVÓ<WÓXÐä',§|¢|Ð4FÈAÐUsÑ'tÐ$Ø'C×'HÑ'HÈÐelÐnpÓ'qÐ$Ø+Ð+r!   c                 ó   • UR                   SS u  pVUcÈ  UR                   S   nUS   S   US-
  :X  d   S5       e[        R                  " SU5      R                  S5      R                  S5      R	                  XVS5      R                  UR                  5      nXƒR                  S5      R	                  XVS5      -
  n[        U R                  U R                  US5      nUR                  SS5      nU R                  U5      n	U	R                  UR                   S S U R                  U R                  4-   5      n	U	R                  SSSSS5      n	U	R                  SU R                  5      n	UR                  S5      nUR	                  U R                   SU R                  S5      nUR                  SUR#                  S5      5      R%                  5       n[        R&                  " U	SUS	9n
U
R                  XPR                   U R                  US5      n
U
$ )
Nr   r%   rJ   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr  r
   r@  )r¡   r   rB  rK   rL   r  r#   rH   r?   r  rÝ   r  rÜ   rÌ   r  rð   r0   rM   r»   rC  )rj   r   rù   rN   rR   rñ   r/   Úkey_sequence_lengthrA   rD  r;  s              r    r  Ú@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddingsƒ  s   € ð '4×&9Ñ&9¸!¸AÐ&>Ñ#ˆ
à,Ñ4Ø".×"4Ñ"4°RÑ"8ÐØ ‘? 1Ñ%Ð)<¸qÑ)@Ó@ð ØtóÐ@ô —’˜QÐ 3Ó4ß‘˜1“ß‘˜1“ß‘˜
°QÓ7ß‘L×'Ñ'Ó(ð ð "4×6LÑ6LÈQÓ6O×6VÑ6VÐWaÐtuÓ6vÑ!vÐÜ0HØ× Ñ  $×"<Ñ"<Ð>PÐRWó1Ð-ð
 &×/Ñ/°°1Ó5ˆØ!×9Ñ9¸-ÓHÐð 0×4Ñ4Ø×Ñ  Ð$¨×(8Ñ(8¸$×:MÑ:MÐ'NÑNó
Ðð 0×7Ñ7¸¸1¸aÀÀAÓFÐà/×7Ñ7¸¸D×<LÑ<LÓMÐà,M×,WÑ,WÐXYÓ,ZÐ)Ø,M×,TÑ,TØJ‰J˜˜4×.Ñ.°ó-
Ð)ð -N×,RÑ,RØÐ1×6Ñ6°rÓ:ó-
ç
‰$‹&ð 	*ô +0¯,ª,Ø AÐ-Nñ+
Ð'ð
 +J×*NÑ*NØŸ
™
 D×$7Ñ$7¸È"ó+
Ð'ð /Ð.r!   )rÎ   rÏ   rÐ   rµ   rÑ   r0   rÌ   r?   r   rÔ   rÓ   r  r  rÒ   ©NNNNNNN)ro   rp   rq   rr   r   r´   rß   r  r   r   r   r¿   r  r  rw   rÈ   rÉ   s   @r    r  r  €  s_   ø† ð Ð/÷  ò:qòð 37ØØØ(,Ø'+Ø*.ØñpPð !  v¡Ñ/õpPòd+,÷Z9/ð 9/r!   r  c                   óH   ^ • \ rS rSrSrS\4U 4S jjr SS\4S jjrSr	U =r
$ )	ÚProphetNetEncoderLayeri¿  z
Encoder block for Prophetnet
r“   c                 óò   >• [         TU ]  5         [        XR                  5      U l        [        UR                  5      U l        [        XR                  5      U l
        [        UR                  5      U l        g r°   )r³   r´   rË   Únum_encoder_attention_headsÚ	self_attnr	   rµ   Úself_attn_layer_normr  Úencoder_ffn_dimÚfeed_forwardÚfeed_forward_layer_normr¶   s     €r    r´   ÚProphetNetEncoderLayer.__init__Ä  s[   ø€ Ü‰ÑÔä,¨V×5WÑ5WÓXˆŒÜ$-¨f×.@Ñ.@Ó$AˆÔ!ô 2°&×:PÑ:PÓQˆÔÜ'0°×1CÑ1CÓ'DˆÕ$r!   rä   c                 ó¼   • U R                  UUUUS9u  pVnU R                  XQ-   5      nU R                  U5      nU R                  X-   5      nU4n	U(       a  X–4-  n	U	$ )N)r   rÁ   râ   rä   )rN  rO  rQ  rR  )
rj   r   rÁ   râ   rä   Úattention_outputrù   Ú_Úfeed_forward_outputÚoutputss
             r    r¿   ÚProphetNetEncoderLayer.forwardÎ  s€   € ð -1¯N©NØ'Ø)Ø+Ø/ð	 -;ð -
Ñ)Ð¨ð ×1Ñ1Ð2BÑ2RÓSˆð #×/Ñ/°Ó>ÐØ×4Ñ4Ð5HÑ5XÓYˆà Ð"ˆæØÑ&ˆGàˆr!   )rQ  rR  rN  rO  ©F©ro   rp   rq   rr   rs   r   r´   rÿ   r¿   rw   rÈ   rÉ   s   @r    rK  rK  ¿  s0   ø† ñðEÐ/÷ Eð #(ñð
  ÷ó r!   rK  c                   ób   ^ • \ rS rSrSrS\4U 4S jjr            S	S\S\4S jjrSr	U =r
$ )
ÚProphetNetDecoderLayeriê  z
Decoder block for Prophetnet
r“   c                 óh  >• [         TU ]  5         [        U5      U l        [	        UR
                  5      U l        UR                  (       a4  [        XR                  5      U l
        [	        UR
                  5      U l        [        XR                  5      U l        [	        UR
                  5      U l        g r°   )r³   r´   r  rN  r	   rµ   rO  Úadd_cross_attentionrË   r  Ú
cross_attnÚcross_attn_layer_normr  Údecoder_ffn_dimrQ  rR  r¶   s     €r    r´   ÚProphetNetDecoderLayer.__init__ï  s…   ø€ Ü‰ÑÔä5°fÓ=ˆŒÜ$-¨f×.@Ñ.@Ó$AˆÔ!ð ×%×%Ü1°&×:\Ñ:\Ó]ˆDŒOÜ)2°6×3EÑ3EÓ)FˆDÔ&ô 2°&×:PÑ:PÓQˆÔÜ'0°×1CÑ1CÓ'DˆÕ$r!   Ú	use_cacherä   c                 ó~  • Ub  US S OS nU R                  UUUUUUU	U
S9u  nnnnU R                  X-   5      nUb  USS  OS nS nUb1  U R                  UUUUUUS9u  nnnU R                  UU-   5      nUU-   nU R	                  U5      nU R                  UU-   5      nU4nU(       a  UUUU4-  nU(       a  UU4-  nU$ )Nr%   )r   rã   rÁ   râ   r   rQ   rR   rN   éþÿÿÿ)r   rá   rÁ   râ   rã   rä   )rN  rO  r`  ra  rQ  rR  )rj   r   rÁ   ra   Úencoder_attn_maskrâ   Úcross_attn_layer_head_maskr   rQ   rR   rN   rã   rd  rä   Úself_attn_past_key_valueÚngram_attention_outputÚself_attn_weightsÚself_attn_weights_ngramÚpresent_key_valueÚcross_attn_past_key_valueÚcross_attn_weightsrU  Úcross_attn_present_key_valuerW  rX  s                            r    r¿   ÚProphetNetDecoderLayer.forwardþ  sH  € ð$ :HÑ9S >°"°1Ñ#5ÐY]Ð Ø`d×`nÑ`nØ'Ø3Ø)Ø+Ø,KØ+IØ.OØ%ð aoð 	a
Ñ]ÐÐ 1Ð3JÐL]ð ×1Ñ1°-Ñ2XÓYˆð <JÑ;U N°2°3Ñ$7Ð[_Ð!Ø!ÐØ Ñ,àQU×Q`ÑQ`Ø+Ø!6Ø0Ø :Ø8Ø"3ð Rað RÑNÐÐ0Ð2Nð !×6Ñ6Ð7GÈ-Ñ7WÓXˆMð !2Ð4PÑ PÐð #×/Ñ/°Ó>ÐØ×4Ñ4Ð5HÈ=Ñ5XÓYˆà Ð"ˆæØÐ)Ð+BÐDVÐWÑWˆGæØÐ)Ð+Ñ+ˆGàˆr!   )r`  ra  rQ  rR  rN  rO  )NNNNNNNNNNTFr[  rÉ   s   @r    r]  r]  ê  s[   ø† ñðEÐ/÷ Eð$ Ø"ØØØ#'Ø(,Ø'+Ø*.ØØØØ"'ñ=ð ð=ð  ÷=ó =r!   r]  z=
    The standalone encoder part of the ProphetNetModel.
    )Úcustom_introc                   ó"  ^ • \ rS rSrSS\S\R                  4U 4S jjjrS rS r	\
       SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )ÚProphetNetEncoderi>  r“   Úword_embeddingsc                 ó¼  >• [         TU ]  U5        Ub  UO3[        R                  " UR                  UR
                  UR                  S9U l        [        U5      U l	        [        UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l        U R%                  5         gs  snf ©a  
word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
    The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
    embeddings instead of randomly initialized word embeddings.
N©r˜   F)r³   r´   r   r—   Ú
vocab_sizerµ   rŸ   ru  r­   Úposition_embeddingsr	   Úembeddings_layer_normÚ
ModuleListr+   Únum_encoder_layersrK  ÚlayersÚgradient_checkpointingÚ	post_init©rj   r“   ru  rV  r·   s       €r    r´   ÚProphetNetEncoder.__init__D  s¸   ø€ ô 	‰Ñ˜Ô ð Ñ*ñ ä—’˜f×/Ñ/°×1CÑ1CÐQW×QdÑQdÑeð 	Ôô
 $BÀ&Ó#IˆÔ Ü%.¨v×/AÑ/AÓ%BˆÔ"ä—m’mÌUÐSY×SlÑSlÔMmÓ$nÒMmÈÔ%;¸FÖ%CÑMmÑ$nÓoˆŒà&+ˆÔ#à‰Õùò	 %os   Â Cc                 ó   • U R                   $ r°   ©ru  ri   s    r    Úget_input_embeddingsÚ&ProphetNetEncoder.get_input_embeddingsZ  ó   € Ø×#Ñ#Ð#r!   c                 ó   • Xl         g r°   r„  ©rj   Úvalues     r    Úset_input_embeddingsÚ&ProphetNetEncoder.set_input_embeddings]  ó   € Ø$Õr!   r¥   rÁ   Ú	head_maskÚinputs_embedsrä   Úoutput_hidden_statesÚreturn_dictr®   c                 ó’  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  Uc  [	        S5      eUb  Ub  [	        S5      eUb  Uc  U R                  U5      nUb}  SUSS2SSSS24   R                  SU R                   R                  SS5      -
  [        R                  " U R                  5      R                  -  nUR                  UR                  5      nOSnU R                  UR                  SS UR                  5      u  pšXI-   nU R!                  U5      n["        R$                  R'                  X°R                   R&                  U R(                  S9nU(       a  SOSnU(       a  SOSnUb\  UR+                  5       S	   [-        U R.                  5      :X  d2   S
[-        U R.                  5       SUR+                  5       S	    S35       e[1        U R.                  5       H  u  pïU(       a  XË4-   nU R2                  (       a8  U R(                  (       a'  U R5                  UR6                  UUUb  X>   OSU5      nOU" UUUb  X>   OSUS9nUS	   nU(       d  Mx  UUS   4-   nMƒ     U(       a  XË4-   nU(       d  [9        S X¼U4 5       5      $ [;        X¼US9$ )a¹  
Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetEncoder
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.ç      ð?r   r%   ré   rn   r   z&The head_mask should be specified for ú layers, but it is for Ú.)rÁ   râ   rä   c              3   ó.   #   • U  H  oc  M  Uv •  M     g 7fr°   rn   ©Ú.0Úvs     r    Ú	<genexpr>Ú,ProphetNetEncoder.forward.<locals>.<genexpr>»  s   é € ÐlÒ$Z˜qŸ™Ò$Zùó   ‚Œ	)r{   r   rƒ   )r“   rä   r  Úuse_return_dictrï   ru  rL   rM  r   r'   r   r(   r  rz  r¡   r#   r{  r   r   rÏ   rë   rM   Úlenr~  Ú	enumerater  Ú_gradient_checkpointing_funcÚ__call__Útupler   )rj   r¥   rÁ   rŽ  r  rä   r  r‘  Úextended_attention_maskrz  rN   r   ra   Úall_attentionsÚidxÚencoder_layerÚlayer_outputss                    r    r¿   ÚProphetNetEncoder.forward`  sÜ  € ð4 2CÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆàÑ Ñ!6ÜÐRÓSÐSØÑ" }Ñ'@ÜÐQÓRÐRØÑ" }Ñ'<Ø ×0Ñ0°Ó;ˆMð Ñ%àn¢Q¨¨d²AÐ%5Ñ6×=Ñ=¸aÀÇÁ×AhÑAhÐjkÐmnÓoÑoÜ—’˜DŸJ™JÓ'×+Ñ+ñ',Ð#ð '>×&@Ñ&@À×ATÑATÓ&UÑ#à&*Ð#à,0×,DÑ,DÀ]×EXÑEXÐY[ÐZ[ÐE\Ð^k×^rÑ^rÓ,sÑ)Ðà%Ñ;ˆØ×2Ñ2°=ÓAˆÜŸ™×-Ñ-¨m¿{¹{×?RÑ?RÐ]a×]jÑ]jÐ-Ðkˆæ&:¡ÀÐÞ0™°dˆð Ñ Ø—>‘>Ó# AÑ&¬3¨t¯{©{Ó+;Ó<ð Ø8¼¸T¿[¹[Ó9IÐ8JÐJaÐbk×bpÑbpÓbrÐstÑbuÐavÐvwÐxóÐ<ô #,¨D¯K©KÖ"8ÑˆCÞ#Ø(=Ð@PÑ(PÐ%à×*×*¨t¯}¯}Ø $× AÑ AØ!×*Ñ*Ø!Ø+Ø'0Ñ'<Y’^À$Ø%ó!‘ñ !.Ø!Ø#:Ø7@Ñ7L Y¢^ÐRVØ&7ñ	!ð *¨!Ñ,ˆMç Ð Ø!/°=ÀÑ3CÐ2EÑ!E’ñ/ #9ö2  Ø$9Ð<LÑ$LÐ!æÜÑl ]È>Ñ$ZÓlÓlÐlÜØ+Ð]kñ
ð 	
r!   )r{  r  r~  rz  ru  r°   rI  )ro   rp   rq   rr   r   r   r—   r´   r…  r‹  r   r   r   r   rÿ   r   r   r   r¿   rw   rÈ   rÉ   s   @r    rt  rt  >  sç   ø† ñÐ/ð À"Ç,Á,÷ ð ò,$ò%ð ð -1Ø15Ø,0Ø04Ø,0Ø/3Ø&*ñ]
à˜EŸL™LÑ)ð]
ð ! §¡Ñ.ð]
ð ˜EŸL™LÑ)ð	]
ð
   §¡Ñ-ð]
ð $ D™>ð]
ð ' t™nð]
ð ˜d‘^ð]
ð 
ˆuoÐ%Ñ	&ô]
ó ö]
r!   rt  z=
    The standalone decoder part of the ProphetNetModel.
    c                   óÒ  ^ • \ rS rSrSS\S\\R                     4U 4S jjjrS r	S r
\            SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\\\R                           S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       rS rS rS rSrU =r$ )ÚProphetNetDecoderiÁ  r“   ru  c                 óÈ  >• [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        Ub  UO3[        R                  " UR                  UR                  UR                  S9U l        [        U5      U l        [        R                  " U R                  UR                  S5      U l        [        R"                  " [%        UR&                  5       Vs/ s H  n[)        U5      PM     sn5      U l        [-        UR                  5      U l        SU l        U R3                  5         gs  snf rw  )r³   r´   r0   r?   r  rÏ   r±   Úmax_target_positionsr   r—   ry  rµ   rŸ   ru  r­   rz  Úngram_embeddingsr|  r+   Únum_decoder_layersr]  r~  r	   r{  r  r€  r  s       €r    r´   ÚProphetNetDecoder.__init__Ç  s  ø€ ô 	‰Ñ˜Ô à—\‘\ˆŒ
Ø!×-Ñ-ˆÔØ%+×%AÑ%AˆÔ"Ø—~‘~ˆŒØ$*×$BÑ$BˆÔ!ð Ñ*ñ ä—’˜f×/Ñ/°×1CÑ1CÐQW×QdÑQdÑeð 	Ôô
 $BÀ&Ó#IˆÔ ä "§¢¨T¯Z©Z¸×9KÑ9KÈTÓ RˆÔÜ—m’mÌUÐSY×SlÑSlÔMmÓ$nÒMmÈÔ%;¸FÖ%CÑMmÑ$nÓoˆŒÜ%.¨v×/AÑ/AÓ%BˆÔ"à&+ˆÔ#à‰Õùò %os   ÄEc                 ó   • U R                   $ r°   r„  ri   s    r    r…  Ú&ProphetNetDecoder.get_input_embeddingsä  r‡  r!   c                 ó   • Xl         g r°   r„  r‰  s     r    r‹  Ú&ProphetNetDecoder.set_input_embeddingsç  r  r!   r¥   rÁ   ra   Úencoder_attention_maskrŽ  Úcross_attn_head_maskrZ   r  rd  rä   r  r‘  r®   c                 óò  • U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  Uc  [        S5      eUb  Ub  [        S5      eUb  Uc  U R                  U5      nUR                  SS u  pÞU R                  XÞ4UR                  US9u  nnUb  Su  nnOU R                  U5      u  nnU R                  R                  US-   5      nX-   nU R                  R                  nUb`  UR                  S5      S:X  d   S5       e[        U R                   5       Vs/ s H  nUUS-
     U-   R#                  USS5      PM!     nnSnSnOR[        U R                   5       Vs/ s H  nUUS-
     U-   PM     nnU R%                  UU5      nU R'                  UU5      nUb}  S	USS2SSSS24   R#                  SU R                   R(                  SS5      -
  [*        R,                  " U R.                  5      R0                  -  nUR3                  UR.                  5      nOSn[*        R4                  " U/U-   S5      nU R6                  (       a  U R7                  U5      n[8        R:                  R=                  UU R<                  U R>                  S
9nU(       a  SOSnU(       a  U R                   R                   S:”  a  SOSnU
(       a  SOSnU
(       a  SOSnU
(       a  U R                   R@                  (       a  SOSnU RB                  (       a/  U R>                  (       a  U	(       a  [D        RG                  S5        Sn	U	(       a  SOSn [I        XV/SS/5       Hj  u  n!n"U!c  M  U!R                  5       S   [K        U RL                  5      :X  a  M7   SU" S[K        U RL                  5       SUR                  5       S    S35       e   [O        U RL                  5       GH-  u  n#n$U(       a8  UUSS2SU24   4-  nU R                   R                   S:”  a  UUSS2US24   4-  nUb  UU#   OSn%U RB                  (       aJ  U R>                  (       a9  U RQ                  U$RR                  UUUUUb  UU#   OSUb  UU#   OSUUUUSU	U
5      n&O"U$" UUUUUb  UU#   OSUb  UU#   OSUUUUU%U	U
S9n&U&S   nU	(       a  U U&U
(       a  SOS   4-  n U
(       d  Mô  UU&S   4-  nUU&S   4-  nU R                   R@                  (       d  GM$  UU&S   4-  nGM0     U(       a8  UUSS2SU24   4-  nU R                   R                   S:”  a  UUSS2US24   4-  nUSS2SU24   n'U R                   R                   S:”  a  USS2US24   OSn(U(       d  [U        S U'U(U UUUUU4 5       5      $ [W        U'U(U UUUUUS9$ s  snf s  snf )aé  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetDecoder
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r%   )r#   rZ   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1r“  ré   rn   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FrŽ  rµ  zThe `z` should be specified for r”  r•  )rÁ   ra   rg  râ   rh  r   rQ   rR   rN   rã   rd  rä   r  r
   c              3   ó0   #   • U  H  nUc  M  Uv •  M     g 7fr°   rn   r—  s     r    rš  Ú,ProphetNetDecoder.forward.<locals>.<genexpr>°  s"   é € ð ò	Að ÷ ‘ò	ùs   ‚	)r{   r|   rZ   r   r‚   rƒ   r„   r_   ),r“   rd  rä   r  r  rï   ru  r¡   rz  r#   Ú!compute_buffered_relative_bucketsrÆ   r­  r   rM   r+   r0   rL   Úprepare_attention_maskÚprepare_predict_attention_maskr  r   r'   r   r(   r  r.   r{  r   r   rÏ   rë   r_  r  ÚloggerÚwarning_onceÚziprž  r~  rŸ  r   r¡  r¢  r€   ))rj   r¥   rÁ   ra   r´  rŽ  rµ  rZ   r  rd  rä   r  r‘  rñ   r/   Úmain_stream_pos_embedrN   rQ   rR   Úpredicting_stream_pos_embedr   r­  r0   Úngram_hidden_statesr£  r   Úextended_encoder_attention_maskÚall_main_stream_hidden_statesÚall_ngram_stream_hidden_statesÚall_main_stream_attnsÚall_ngram_stream_attnsÚall_cross_attnsÚpresent_key_valuesÚ	attn_maskÚ	mask_namer¥  Údecoder_layerrã   r§  r{   r|   s)                                            r    r¿   ÚProphetNetDecoder.forwardê  sÏ  € ðH "+Ñ!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆàÑ Ñ!6ÜÐfÓgÐgØÑ" }Ñ'@ÜÐeÓfÐfØÑ" }Ñ'<Ø ×0Ñ0°Ó;ˆMà&3×&9Ñ&9¸"¸1Ð&=Ñ#ˆ
à.2×.FÑ.FØÐ)Ø ×'Ñ'Ø+ð /Gð /
Ñ+Ð˜|ð Ñ&ØPZÑMÐ*Ñ,Mð
 ×6Ñ6°|ÓDñØ.Ø1à&*×&>Ñ&>×&GÑ&GÈÐWXÑHXÓ&YÐ#ð &Ñ=ˆà×0Ñ0×7Ñ7Ðð Ñ&Ø ×%Ñ% aÓ(¨AÓ-ð ØaóÐ-ô # 4§:¡:Ô.ó#â.Eð " %¨!¡)Ñ,Ð/JÑJ×RÑRÐS]Ð_`ÐbcÖdÙ.ð  ð #ð '+Ð#Ø.2Ñ+ô Z_Ð_c×_iÑ_iÔYjó#ÚYjÐPUÐ! %¨!¡)Ñ,Ð/JÔJÑYjð  ð #ð '+×&AÑ&AÀ-ÐQ_Ó&`Ð#Ø.2×.QÑ.QÐR_ÐaoÓ.pÐ+ð "Ñ-àÐ,ªQ°°dºAÐ-=Ñ>×EÑEÀaÈÏÉ×IpÑIpÐrsÐuvÓwÑwÜ—’˜DŸJ™JÓ'×+Ñ+ñ/,Ð+ð /N×.PÑ.PÐQ^×QdÑQdÓ.eÑ+à.2Ð+äŸ	š	 = /Ð4GÑ"GÈÓKˆà×%×%Ø ×6Ñ6°}ÓEˆMäŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-Ðdˆö /C©ÈÐ%Þ/CÈÏÉ×HYÑHYÐ\]ÓH]©ÐcgÐ&æ&7¡¸TÐÞ'8¡¸dÐÞ 1°d·k±k×6U×6U™"Ð[_ˆà×&×&¨4¯=¯=ÞÜ×#Ñ#Øpôð "	æ#,™R°$Ðô %(¨Ð(IÈKÐYoÐKpÖ$qÑ ˆIyØÓ$Ø —~‘~Ó'¨Ñ*¬s°4·;±;Ó/?Õ@ð Ø˜I˜;Ð&@ÄÀTÇ[Á[ÓAQÐ@Rð SØ!Ÿ™Ó(¨Ñ+Ð,¨Að/óÐ@ñ %rô #,¨D¯K©K×"8ÑˆCÞ#à-°-ÂÐCSÀOÐCSÐ@SÑ2TÐ1VÑVÐ-Ø—;‘;×$Ñ$ qÓ(Ø2°}ÂQÈÑHXÐEXÑ7YÐ6[Ñ[Ð2à5DÑ5P˜_¨SÒ1ÐVZˆNà×*×*¨t¯}¯}Ø $× AÑ AØ!×*Ñ*Ø!Ø+Ø)Ø3Ø'0Ñ'<Y˜s’^À$Ø2FÑ2RÐ)¨#Ò.ÐX\Ø3Ø2Ø5Ø ØØØ%ó!‘ñ" !.Ø!Ø#:Ø*?Ø&EØ7@Ñ7L Y¨s¢^ÐRVà5IÑ5UÐ,¨SÒ1Ð[_à4SØ3QØ6WØ!-Ø#1Ø'Ø&7ñ!ð$ *¨!Ñ,ˆMæØ" }Ö:K±QÐQRÑ'SÐ&UÑUÐ"ç Ð Ø%¨-¸Ñ*:Ð)<Ñ<Ð%Ø&¨=¸Ñ+;Ð*=Ñ=Ð&à—;‘;×2×2Ò2Ø#¨°aÑ(8Ð':Ñ:“Oño #9ör  Ø)¨mºAÐ?OÀÐ?OÐ<OÑ.PÐ-RÑRÐ)Ø{‰{× Ñ  1Ó$Ø.°=ÂÀOÑDTÐATÑ3UÐ2WÑWÐ.ð *ª!Ð-=¨oÐ-=Ð*=Ñ>ÐØHLÏÉ×HYÑHYÐ\]ÓH] -²°?Ñ3CÐ0CÒ"DÐcgÐæÜñ ð &Ø+Ø&Ø1Ø2Ø)Ø*Ø#ñ	óó ð ô ,Ø/Ø$;Ø.Ø7Ø >Ø,Ø3Ø,ñ	
ð 		
ùòM#ùò#s   Å!&W/Æ%W4c           	      óî  • UR                   u  p#[        R                  " SU R                  5      R	                  UR
                  5      R                  SS5      n[        U R                  U R                  U5      u  pEUS S 2S U2S U24   R                  USS5      n[        R                  " US S 2S U2S U24   US S 2S U2U R                  U R                  U-   24   /S5      R                  USS5      nXE4$ rÚ   )r¡   r   rB  r¬  r  r#   rL   rS   r?   r  r.   )rj   rN   rñ   r/   Úmain_relative_bucketsÚpredict_relative_bucketss         r    r¹  Ú3ProphetNetDecoder.compute_buffered_relative_bucketsÉ  s  € Ø&2×&8Ñ&8Ñ#ˆ
ä—|’| A t×'@Ñ'@ÓA×DÑDÀ\×EXÑEXÓY×`Ñ`ÐabÐdeÓfˆÜ:]Ø×Ñ˜d×8Ñ8¸,ó;
Ñ7Ðð
 !6²aÐ9I¸/Ð9IÐK[ÈOÐK[Ð6[Ñ \× cÑ cÐdnÐpqÐstÓ uÐÜ#(§9¢9à(ªÐ,<¨_Ð,<Ð>N¸Ð>NÐ)NÑOØ(ÚÐ'˜Ð'¨×)BÑ)BÀT×E^ÑE^ÐapÑEpÐ)pÐpñðð ó$
÷ ‰&˜Q Ó
"ð 	!ð %Ð>Ð>r!   c                 óH  • UR                   S S u  p4[        R                  " XD4[        R                  " UR                  5      R
                  UR                  UR                  S9n[        R                  " US5      nUS U2S U24   S S S S 2S S 24   R                  X0R                  R                  4UR                   -   5      nUb@  SUS S 2S S S S 24   -
  [        R                  " U R                  5      R
                  -  nXg-   nOUnUR                  UR                  5      $ )Nr%   rº   r   r“  )r¡   r   Úfullr'   r   r(   r#   ÚtriuÚexpandr“   r  r  )rj   r   rÁ   rñ   Ú
seq_lengthÚcausal_maskÚextended_causal_maskr£  s           r    rº  Ú(ProphetNetDecoder.prepare_attention_maskß  s  € Ø!.×!4Ñ!4°R°aÐ!8Ñˆ
ô —j’jØÐ$ÜKŠK˜×+Ñ+Ó,×0Ñ0Ø×%Ñ%Ø ×'Ñ'ñ	
ˆô —j’j ¨aÓ0ˆà*¨;¨J¨;¸¸¸Ð+CÑDÀTÈ4ÒQRÒTUÐEUÑV×]Ñ]ØŸ™×@Ñ@ÐAÀK×DUÑDUÑUó 
Ðð
 Ñ%Ø'*¨^ºA¸tÀTÊ1Ð<LÑ-MÑ'MÔQV×Q\ÒQ\Ð]a×]gÑ]gÓQh×QlÑQlÑ&lÐ#Ø&:Ñ&TÑ#à&:Ð#Ø&×)Ñ)¨-×*=Ñ*=Ó>Ð>r!   c           	      ó  • UR                   S S u  p4[        U R                  U R                  UR                  UR
                  5      n[        R                  " US S 2S U2S U24   US S 2S U2U R                  U R                  U-   24   /SS9nUS S S S 2S S 2S S 24   R                  X0R                  R                  4UR                   -   5      nUbž  SUS S 2S S S S S 24   -
  [        R                  " U R
                  5      R                  -  nUR                  X0R                  R                  U R                  XD45      n[        R                  " U[        R                  " U5      /SS9nXg-   nOUnUR                  UR
                  5      $ )Nr%   rJ   r   r“  )r¡   r4   r¬  r0   r#   r   r   r.   rÔ  r“   r  r'   r(   r7   r  )	rj   r   rÁ   rñ   rÕ  Úpredict_causal_maskÚextended_predict_causal_maskr£  r   s	            r    r»  Ú0ProphetNetDecoder.prepare_predict_attention_mask÷  sž  € Ø!.×!4Ñ!4°R°aÐ!8Ñˆ
ô 3Ø×%Ñ% t§z¡z°=×3GÑ3GÈ×I\ÑI\ó
Ðô $Ÿišià#¢A {¨
 {°K°Z°KÐ$?Ñ@Ø#Ú{˜
{ D×$=Ñ$=À×@YÑ@YÐ\fÑ@fÐ$fÐfñðð ñ
Ðð (;¸4ÀÂqÊ!ÊQÐ;NÑ'O×'VÑ'VØŸ™×@Ñ@ÐAÐDW×D]ÑD]Ñ]ó(
Ð$ð
 Ñ%Ø'*¨^ºA¸tÀTÈ4ÒQRÐ<RÑ-SÑ'SÔW\×WbÒWbÐcg×cmÑcmÓWn×WrÑWrÑ&rÐ#Ø&=×&DÑ&DØŸ[™[×DÑDÀdÇjÁjÐR\Ðió'Ð#ô ',§i¢iØ(¬%×*:Ò*:Ð;RÓ*SÐTÐZ\ñ'Ð#ð /KÑ.dÑ+à.JÐ+Ø.×1Ñ1°-×2EÑ2EÓFÐFr!   )rÏ   r{  r  r~  r¬  r0   r­  r?   rz  r  ru  r°   )NNNNNNNNNNNN)ro   rp   rq   rr   r   r   r   r—   r´   r…  r‹  r   r   r   r   rÿ   r   r€   r¿   r¹  rº  r»  rw   rÈ   rÉ   s   @r    rª  rª  Á  s|  ø† ñÐ/ð À(È2Ï<É<ÑBX÷ ð ò:$ò%ð ð -1Ø15Ø8<Ø9=Ø,0Ø7;Ø@DØ04Ø$(Ø,0Ø/3Ø&*ñ\
à˜EŸL™LÑ)ð\
ð ! §¡Ñ.ð\
ð  (¨¯©Ñ5ð	\
ð
 !)¨¯©Ñ 6ð\
ð ˜EŸL™LÑ)ð\
ð ' u§|¡|Ñ4ð\
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ð\
ð   §¡Ñ-ð\
ð ˜D‘>ð\
ð $ D™>ð\
ð ' t™nð\
ð ˜d‘^ð\
ð 
ˆuÐ2Ð2Ñ	3ô\
ó ð\
ò|?ò,?÷0!Gð !Gr!   rª  c            $       ó  ^ • \ rS rSrSS/rS\4U 4S jjrS rS rS r	S	 r
S
 r\               SS\\R                     S\\R                     S\\R                     S\\R                      S\\R                     S\\R                     S\\R                     S\\   S\\\\R                           S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4 S jj5       rSrU =r$ )ÚProphetNetModeli  úencoder.word_embeddings.weightúdecoder.word_embeddings.weightr“   c                 ó®  >• [         TU ]  U5        [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " U5      nSUl
        SUl        [        X R                  5      U l        [        R                  " U5      nSUl        SUl
        [        X0R                  5      U l        U R#                  5         g )Nrx  FT)r³   r´   r   r—   ry  rµ   rŸ   ru  ÚcopyÚdeepcopyÚis_encoder_decoderrd  rt  ÚencoderÚ
is_decoderrª  Údecoderr€  )rj   r“   Úencoder_configÚdecoder_configr·   s       €r    r´   ÚProphetNetModel.__init__  s¤   ø€ Ü‰Ñ˜Ô Ü!Ÿ|š|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÑsˆÔäŸš vÓ.ˆØ,1ˆÔ)Ø#(ˆÔ Ü(¨×9MÑ9MÓNˆŒäŸš vÓ.ˆØ$(ˆÔ!Ø,1ˆÔ)Ü(¨×9MÑ9MÓNˆŒð 	‰Õr!   c                 ó   • U R                   $ r°   r„  ri   s    r    r…  Ú$ProphetNetModel.get_input_embeddings0  r‡  r!   c                 ó|   • Xl         U R                   U R                  l         U R                   U R                  l         g r°   )ru  rå  rç  r‰  s     r    r‹  Ú$ProphetNetModel.set_input_embeddings3  s,   € Ø$ÔØ'+×';Ñ';ˆ‰Ô$Ø'+×';Ñ';ˆ‰Õ$r!   c                 óü   • U R                   R                  (       aa  U R                  U R                  R                  U R                  5        U R                  U R
                  R                  U R                  5        g g r°   )r“   Útie_word_embeddingsÚ_tie_or_clone_weightsrå  ru  rç  ri   s    r    Ú_tie_weightsÚProphetNetModel._tie_weights8  sT   € Ø;‰;×*×*Ø×&Ñ& t§|¡|×'CÑ'CÀT×EYÑEYÔZØ×&Ñ& t§|¡|×'CÑ'CÀT×EYÑEYÕZð +r!   c                 ó   • U R                   $ r°   )rå  ri   s    r    Úget_encoderÚProphetNetModel.get_encoder=  ó   € Ø|‰|Ðr!   c                 ó   • U R                   $ r°   ©rç  ri   s    r    Úget_decoderÚProphetNetModel.get_decoder@  r÷  r!   r¥   rÁ   Údecoder_input_idsÚdecoder_attention_maskrŽ  Údecoder_head_maskrµ  Úencoder_outputsrZ   r  Údecoder_inputs_embedsrd  rä   r  r‘  r®   c                 ó^  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUUU
UUUS9nU R                  UUUS   UUUU	UUUUUS9nU(       d  UU-   $ [        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )aW  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetModel

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

>>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
>>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
```)r¥   rÁ   rŽ  r  rä   r  r‘  r   )r¥   rÁ   ra   r´  rŽ  rµ  rZ   r  rä   r  rd  r‘  )r{   r|   rZ   r[   r\   r]   r^   r_   r`   ra   rb   )r“   rd  rä   r  r  rå  rç  ry   r{   r|   rZ   r   r‚   rƒ   r„   r_   )rj   r¥   rÁ   rü  rý  rŽ  rþ  rµ  rÿ  rZ   r  r   rd  rä   r  r‘  Údecoder_outputss                    r    r¿   ÚProphetNetModel.forwardC  sT  € ðr "+Ñ!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆàÑ"Ø"Ÿl™lØ#Ø-Ø#Ø+Ø"3Ø%9Ø'ð +ð ˆOð Ÿ,™,Ø'Ø1Ø"1°!Ñ"4Ø#1Ø'Ø!5Ø+Ø/Ø/Ø!5ØØ#ð 'ð 
ˆö Ø" _Ñ4Ð4Ü+Ø-×?Ñ?Ø$3×$KÑ$KØ+×;Ñ;Ø"1×"?Ñ"?Ø(7×(KÑ(KØ.×9Ñ9Ø%4×%EÑ%EØ,×=Ñ=Ø&5×&GÑ&GØ"1×"?Ñ"?Ø.×9Ñ9ñ
ð 	
r!   )rç  rå  ru  )NNNNNNNNNNNNNNN)ro   rp   rq   rr   Ú_tied_weights_keysr   r´   r…  r‹  rò  rõ  rú  r   r   r   r   Ú
BoolTensorr   rÿ   r   ry   r¿   rw   rÈ   rÉ   s   @r    rÞ  rÞ    s®  ø† à:Ð<\Ð]ÐðÐ/÷ ò"$ò<ò
[ò
òð ð -1Ø15Ø48Ø=AØ,0Ø48Ø7;Ø+/Ø@DØ04Ø8<Ø$(Ø,0Ø/3Ø&*ñ!h
à˜EŸL™LÑ)ðh
ð ! §¡Ñ.ðh
ð $ E§L¡LÑ1ð	h
ð
 !)¨×)9Ñ)9Ñ :ðh
ð ˜EŸL™LÑ)ðh
ð $ E§L¡LÑ1ðh
ð ' u§|¡|Ñ4ðh
ð " %™ðh
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ðh
ð   §¡Ñ-ðh
ð  (¨¯©Ñ5ðh
ð ˜D‘>ðh
ð $ D™>ðh
ð ' t™nðh
ð  ˜d‘^ð!h
ð" 
ˆuÐ2Ð2Ñ	3ô#h
ó öh
r!   rÞ  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c            &       óz  ^ • \ rS rSr/ SQrS\4U 4S jjrS rS rS r	S r
\                S!S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\\\R                           S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4"S jj5       rS"S jrS\R                  4S jr\S 5       rS rS rS rU =r$ )#Ú"ProphetNetForConditionalGenerationi¯  )rß  rà  úlm_head.weightr“   c                 ó  >• [         TU ]  U5        [        U5      U l        UR                  U l        UR                  U l        [        R                  " UR                  UR                  SS9U l        U R                  5         g )NF©r•   )r³   r´   rÞ  r‰   rŸ   r˜   Údisable_ngram_lossr   r   rµ   ry  Úlm_headr€  r¶   s     €r    r´   Ú+ProphetNetForConditionalGeneration.__init__·  sd   ø€ Ü‰Ñ˜Ô Ü)¨&Ó1ˆŒØ!×.Ñ.ˆÔØ"(×";Ñ";ˆÔä—y’y ×!3Ñ!3°V×5FÑ5FÈUÑSˆŒð 	‰Õr!   c                 ó   • U R                   $ r°   ©r  ri   s    r    Úget_output_embeddingsÚ8ProphetNetForConditionalGeneration.get_output_embeddingsÂ  r÷  r!   c                 ó   • Xl         g r°   r  ©rj   Únew_embeddingss     r    Úset_output_embeddingsÚ8ProphetNetForConditionalGeneration.set_output_embeddingsÅ  ó   € Ø%r!   c                 óœ   • U R                   R                  (       a1  U R                  U R                  R                  U R
                  5        g g r°   )r“   rð  rñ  r‰   ru  r  ri   s    r    rò  Ú/ProphetNetForConditionalGeneration._tie_weightsÈ  s2   € Ø;‰;×*×*Ø×&Ñ& t§¡×'FÑ'FÈÏÉÕUð +r!   c                 ó.   • U R                   R                  $ r°   )r‰   ru  ri   s    r    r…  Ú7ProphetNetForConditionalGeneration.get_input_embeddingsÌ  s   € Ø‰×.Ñ.Ð.r!   r¥   rÁ   rü  rý  rŽ  rþ  rµ  rÿ  rZ   r  r   Úlabelsrd  rä   r  r‘  r®   c                 óx  • Ub  UOU R                   R                  nUb  Uc  Uc  U R                  U5      nU R                  UUUUUUUUU	U
UUUUUS9nUb  UR                  OUR                  SS u  nnUS   R                  UU R                   R                  US5      nU R                  U5      nUSS2S4   nU R                   R                  S:”  a  USS2SS24   OSnUR                  5       (       d  UR                  5       nSnUb  U R                  UU5      nU(       d+  [        S UU4 5       5      nUb  U4U-   USS -   $ UUSS -   $ [        UUUUR                  UR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9$ )	aË  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

>>> logits_next_token = outputs.logits  # logits to predict next token as usual
>>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
```N)r¥   rÁ   rü  rý  rŽ  rþ  rµ  rÿ  rZ   r  r   rd  rä   r  r‘  r%   r   rJ   r   c              3   ó.   #   • U  H  oc  M  Uv •  M     g 7fr°   rn   r—  s     r    rš  Ú=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>7  ó   é € ÐRÒ*@ QŸq™qÒ*@ùrœ  )rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   )r“   r  r§   r‰   r¡   rÜ   r0   r  Úis_contiguousrÞ   Ú_compute_lossr¢  rU   rZ   r[   r\   r]   r^   r_   r`   ra   rb   )rj   r¥   rÁ   rü  rý  rŽ  rþ  rµ  rÿ  rZ   r  r   r  rd  rä   r  r‘  rX  rñ   r/   Úpredicting_streamsÚpredict_logitsrX   rY   rW   Ú
all_logitss                             r    r¿   Ú*ProphetNetForConditionalGeneration.forwardÏ  sü  € ð| &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆàÑÐ"3Ñ";Ð@UÑ@]à $× 1Ñ 1°&Ó 9Ðà—/‘/ØØ)Ø/Ø#9ØØ/Ø!5Ø+Ø+Ø'Ø"7ØØ/Ø!5Ø#ð "ð 
ˆð$ (9Ñ'DÐ×#Ò#ÐJ_×JeÑJeÐfhÐghÐJiñ 	$ˆ
Oð % Q™ZŸ_™_¨Z¸¿¹×9JÑ9JÈOÐ]_Ó`ÐØŸ™Ð&8Ó9ˆà¢ 1 Ñ%ˆØ04·±×0AÑ0AÀAÓ0E~¢a¨© eÒ,È4ˆð ×#Ñ#×%Ñ%Ø×&Ñ&Ó(ˆFàˆØÑØ×%Ñ% n°fÓ=ˆDæÜÑR¨6°<Ñ*@ÓRÓRˆJØ9=Ñ9ID7˜ZÑ'¨'°!°"¨+Ñ5ÐgÈzÐ\cÐdeÐdfÐ\gÑOgÐgä,ØØØ)Ø '× 7Ñ 7Ø&-×&CÑ&CØ,3×,OÑ,OØ#*×#=Ñ#=Ø)0×)IÑ)IØ!(×!9Ñ!9Ø*1×*KÑ*KØ&-×&CÑ&CØ#*×#=Ñ#=ñð r!   c                 óÄ  • UR                  U R                  R                  UR                  S5      UR                  S5      5      R	                  U5      n[        U R                  R                  5       H'  nUS:”  a  U R                  (       a    OX$US S 2S S 24'   M)     UR                  SS5      R                  5       n[        R                  R                  UR                  SUR                  S5      5      S[        R                  S9n[        R                  R                  XdR                  S5      SS9nU R                  R                   S:”  aŽ  UR#                  SSS	9* nUR%                  U5      R                  S5      n	X‰   nUR'                  5       nU R                  R                   UR                  S5      -  n
S
U R                  R                   -
  U-  X¨-  -   nU$ ©Nr   r   rJ   r   rŒ   )Ú	reductionr‹   T)r   Úkeepdimr“  ©r    r“   r0   rM   Úfill_r+   r  rÝ   rÞ   r   r   Úlog_softmaxrÜ   r   r   Únll_lossÚepsÚsumÚnerŒ   ©rj   rX   r  Úignore_indexÚexpend_targetsÚiÚlprobsrW   Úsmooth_lossÚnon_masked_tokensÚeps_is              r    r"  Ú0ProphetNetForConditionalGeneration._compute_lossI  óˆ  € Ø×)Ñ)¨$¯+©+×*;Ñ*;¸V¿[¹[È»^ÈVÏ[É[ÐYZË^Ó\×bÑbÐcoÓpˆät—{‘{×(Ñ(Ö)ˆAØ1‹u˜×0×0ÙØ&,˜1ša¢˜7Ó#ñ *ð
 ×!Ñ! ! QÓ'×2Ñ2Ó4ˆÜ—‘×*Ñ*ØK‰K˜˜FŸK™K¨›OÓ,ØÜ—-‘-ð +ð 
ˆô }‰}×%Ñ% f×.AÑ.AÀ"Ó.EÐQWÐ%ÐXˆà;‰;?‰?˜SÓ Ø!Ÿ:™:¨"°d˜:Ð;Ð;ˆKØ .× 1Ñ 1°,Ó ?× DÑ DÀRÓ HÐØ%Ñ8ˆKØ%×*Ñ*Ó,ˆKà—K‘K—O‘O f§k¡k°"£oÑ5ˆEØ˜$Ÿ+™+Ÿ/™/Ñ)¨TÑ1°EÑ4GÑGˆDàˆr!   c                 ó$   • U R                  U5      $ r°   )r§   )rj   r  s     r    Ú%prepare_decoder_input_ids_from_labelsÚHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelse  s   € Ø× Ñ  Ó(Ð(r!   c                 ób   ^• SnU  H%  nU[        U4S jUS S  5       5      USS  -   4-  nM'     U$ )Nrn   c              3   óx   >#   • U  H/  oR                  S TR                  UR                  5      5      v •  M1     g7f©r   N©Úindex_selectr  r#   ©r˜  Ú
past_stateÚbeam_idxs     €r    rš  ÚDProphetNetForConditionalGeneration._reorder_cache.<locals>.<genexpr>o  s1   øé € ÐrÒcqÐU_×-Ñ-¨a°·±¸Z×=NÑ=NÓ1O×PÐPÒcqùó   ƒ7:r%   ©r¢  ©rZ   rF  Úreordered_pastÚ
layer_pasts    `  r    Ú_reorder_cacheÚ1ProphetNetForConditionalGeneration._reorder_cacheh  sQ   ø€ ð ˆÛ)ˆJàÜÔrÐcmÐnpÐopÑcqÓrÓrØ˜Q˜R.ñ!ðñ ŠNñ *ð Ðr!   c                 ó.   • U R                   R                  $ r°   )r‰   rå  ri   s    r    rõ  Ú.ProphetNetForConditionalGeneration.get_encodert  ó   € Ø‰×&Ñ&Ð&r!   c                 ó.   • U R                   R                  $ r°   ©r‰   rç  ri   s    r    rú  Ú.ProphetNetForConditionalGeneration.get_decoderw  rQ  r!   ©r  r  r˜   r‰   )NNNNNNNNNNNNNNNN©r   )ro   rp   rq   rr   r  r   r´   r  r  rò  r…  r   r   r   r   r  r   rÿ   r   rU   r¿   r"  r=  ÚstaticmethodrM  rõ  rú  rw   rÈ   rÉ   s   @r    r  r  ¯  sû  ø† ò pÐð	Ð/÷ 	òò&òVò/ð ð -1Ø15Ø48Ø=AØ,0Ø48Ø7;Ø26Ø@DØ04Ø8<Ø)-Ø$(Ø,0Ø/3Ø&*ñ#wà˜EŸL™LÑ)ðwð ! §¡Ñ.ðwð $ E§L¡LÑ1ð	wð
 !)¨×)9Ñ)9Ñ :ðwð ˜EŸL™LÑ)ðwð $ E§L¡LÑ1ðwð ' u§|¡|Ñ4ðwð " %§,¡,Ñ/ðwð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ðwð   §¡Ñ-ðwð  (¨¯©Ñ5ðwð ˜Ÿ™Ñ&ðwð ˜D‘>ðwð $ D™>ðwð  ' t™nð!wð" ˜d‘^ð#wð$ 
ˆuÐ/Ð/Ñ	0ô%wó ðwôrð8)¸E¿L¹Lô )ð ñó ðò'÷'ð 'r!   r  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                    ó  ^ • \ rS rSr/ SQrS\4U 4S jjrS rS rS r	S r
S	 rS
 rS r\             SS\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\\\R"                           S\\R"                     S\\R"                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       rS S jr    S!S jr\S 5       rSrU =r$ )"ÚProphetNetForCausalLMi{  )z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightr  r“   c                 óN  >• [         R                  " U5      nSUl        SUl        [        TU ]  U5        [        U5      U l        UR                  U l	        UR                  U l
        [        R                  " UR                  UR                  SS9U l        U R!                  5         g )NTFr
  )râ  rã  ræ  rä  r³   r´   ÚProphetNetDecoderWrapperr‰   rŸ   r˜   r  r   r   rµ   ry  r  r€  r¶   s     €r    r´   ÚProphetNetForCausalLM.__init__‡  s‚   ø€ ä—’˜vÓ&ˆØ ˆÔØ$)ˆÔ!Ü‰Ñ˜Ô Ü2°6Ó:ˆŒà!×.Ñ.ˆÔØ"(×";Ñ";ˆÔä—y’y ×!3Ñ!3°V×5FÑ5FÈUÑSˆŒð 	‰Õr!   c                 óB   • U R                   R                  R                  $ r°   ©r‰   rç  ru  ri   s    r    r…  Ú*ProphetNetForCausalLM.get_input_embeddings—  s   € Ø‰×&Ñ&×6Ñ6Ð6r!   c                 ó8   • XR                   R                  l        g r°   r^  r‰  s     r    r‹  Ú*ProphetNetForCausalLM.set_input_embeddingsš  s   € Ø27‰×ÑÕ/r!   c                 ó   • U R                   $ r°   r  ri   s    r    r  Ú+ProphetNetForCausalLM.get_output_embeddings  r÷  r!   c                 ó   • Xl         g r°   r  r  s     r    r  Ú+ProphetNetForCausalLM.set_output_embeddings   r  r!   c                 ó°   • U R                   R                  (       a;  U R                  U R                  R                  R
                  U R                  5        g g r°   )r“   rð  rñ  r‰   rç  ru  r  ri   s    r    rò  Ú"ProphetNetForCausalLM._tie_weights£  s;   € Ø;‰;×*×*Ø×&Ñ& t§¡×'>Ñ'>×'NÑ'NÐPT×P\ÑP\Õ]ð +r!   c                 ó$   • XR                   l        g r°   rS  )rj   rç  s     r    Úset_decoderÚ!ProphetNetForCausalLM.set_decoder§  s   € Ø")‰Õr!   c                 ó.   • U R                   R                  $ r°   rS  ri   s    r    rú  Ú!ProphetNetForCausalLM.get_decoderª  rQ  r!   r¥   rÁ   ra   r´  rŽ  rµ  rZ   r  r  rd  rä   r  r‘  r®   c                 óÄ  • Ub  UOU R                   R                  nU R                  R                  UUUUUUUUU
UUUS9nUb  UR                  OUR                  SS u  nnUS   R                  XðR                   R                  US5      nU R                  U5      nUSS2S4   nU R                   R                  S:”  a  USS2SS24   OSnSnU	b  U R                  UU	5      nU(       d+  [        S UU4 5       5      nUb  U4U-   USS -   $ UUSS -   $ [        UUUUR                  UR                  UR                  UR                  UR                  UR                   S9	$ )	aÀ  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetForCausalLM
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits

>>> # Model can also be used with EncoderDecoder framework
>>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
>>> import torch

>>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
>>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
... )

>>> ARTICLE = (
...     "the us state department said wednesday it had received no "
...     "formal word from bolivia that it was expelling the us ambassador there "
...     "but said the charges made against him are `` baseless ."
... )
>>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
>>> labels = tokenizer_dec(
...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
... ).input_ids
>>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

>>> loss = outputs.loss
```N)r¥   rÁ   ra   r´  rŽ  rµ  rZ   r  rd  rä   r  r‘  r%   r   rJ   r   c              3   ó.   #   • U  H  oc  M  Uv •  M     g 7fr°   rn   r—  s     r    rš  Ú0ProphetNetForCausalLM.forward.<locals>.<genexpr>  r   rœ  )	rW   rX   rY   rZ   r   r‚   rƒ   r„   r_   )r“   r  r‰   rç  r¡   rÜ   r0   r  r"  r¢  r†   rZ   r   r‚   rƒ   r„   r_   )rj   r¥   rÁ   ra   r´  rŽ  rµ  rZ   r  r  rd  rä   r  r‘  rX  rñ   r/   r#  r$  rX   rY   rW   r%  s                          r    r¿   ÚProphetNetForCausalLM.forward­  s–  € ðB &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆð —/‘/×)Ñ)ØØ)Ø"7Ø#9ØØ!5Ø+Ø'ØØ/Ø!5Ø#ð *ð 
ˆð :CÑ9N i§o¢oÐTa×TgÑTgÐhjÐijÐTkÑ#ˆ
Oà$ Q™ZŸ_™_¨Z¿¹×9JÑ9JÈOÐ]_Ó`ÐØŸ™Ð&8Ó9ˆà¢ 1 Ñ%ˆØ04·±×0AÑ0AÀAÓ0E~¢a¨© eÒ,È4ˆàˆØÑØ×%Ñ% n°fÓ=ˆDæÜÑR¨6°<Ñ*@ÓRÓRˆJØ9=Ñ9ID7˜ZÑ'¨'°!°"¨+Ñ5ÐgÈzÐ\cÐdeÐdfÐ\gÑOgÐgä,ØØØ)Ø '× 7Ñ 7Ø%×3Ñ3Ø$+×$?Ñ$?Ø"×-Ñ-Ø!(×!9Ñ!9Ø!(×!9Ñ!9ñ
ð 
r!   c                 óÄ  • UR                  U R                  R                  UR                  S5      UR                  S5      5      R	                  U5      n[        U R                  R                  5       H'  nUS:”  a  U R                  (       a    OX$US S 2S S 24'   M)     UR                  SS5      R                  5       n[        R                  R                  UR                  SUR                  S5      5      S[        R                  S9n[        R                  R                  XdR                  S5      SS9nU R                  R                   S:”  aŽ  UR#                  SSS	9* nUR%                  U5      R                  S5      n	X‰   nUR'                  5       nU R                  R                   UR                  S5      -  n
S
U R                  R                   -
  U-  X¨-  -   nU$ r(  r+  r2  s              r    r"  Ú#ProphetNetForCausalLM._compute_loss  r;  r!   c                 ór   • Uc  UR                  UR                  5      nU(       a  US S 2SS 24   nUUUUUS.$ )NrJ   )r¥   rÁ   rŽ  rZ   rd  )Únew_onesr¡   )rj   r¥   rZ   rÁ   rŽ  rd  Úkwargss          r    Úprepare_inputs_for_generationÚ3ProphetNetForCausalLM.prepare_inputs_for_generation8  sL   € ð Ñ!Ø&×/Ñ/°	·±Ó@ˆNæØ!¢! R¡S &Ñ)ˆIð #Ø,Ø"Ø.Ø"ñ
ð 	
r!   c                 óP   ^• SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nrn   c              3   óx   >#   • U  H/  oR                  S TR                  UR                  5      5      v •  M1     g7frA  rB  rD  s     €r    rš  Ú7ProphetNetForCausalLM._reorder_cache.<locals>.<genexpr>X  s1   øé € ÐnÒcmÐU_×-Ñ-¨a°·±¸Z×=NÑ=NÓ1O×PÐPÒcmùrH  rI  rJ  s    `  r    rM  Ú$ProphetNetForCausalLM._reorder_cacheR  s:   ø€ ð ˆÛ)ˆJØÜÔnÑcmÓnÓnðñ ŠNñ *ð Ðr!   rU  )NNNNNNNNNNNNNrV  )NNNN)ro   rp   rq   rr   r  r   r´   r…  r‹  r  r  rò  ri  rú  r   r   r   r   r   rÿ   r   r†   r¿   r"  rv  rW  rM  rw   rÈ   rÉ   s   @r    rY  rY  {  s¶  ø† òÐðÐ/÷ ò 7ò8òò&ò^ò*ò'ð ð -1Ø15Ø8<Ø9=Ø,0Ø7;Ø@DØ04Ø)-Ø$(Ø,0Ø/3Ø&*ñlà˜EŸL™LÑ)ðlð ! §¡Ñ.ðlð  (¨¯©Ñ5ð	lð
 !)¨¯©Ñ 6ðlð ˜EŸL™LÑ)ðlð ' u§|¡|Ñ4ðlð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ðlð   §¡Ñ-ðlð ˜Ÿ™Ñ&ðlð ˜D‘>ðlð $ D™>ðlð ' t™nðlð ˜d‘^ðlð 
ˆuÐ/Ð/Ñ	0ôló ðlô\ð> ØØØô
ð4 ñó ör!   rY  c                   ó@   ^ • \ rS rSrSrS\4U 4S jjrS rS rSr	U =r
$ )r[  i]  zx
This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
classes.
r“   c                 óæ   >• [         TU ]  U5        [        R                  " UR                  UR
                  UR                  S9U l        [        XR                  S9U l	        U R                  5         g )Nrx  r„  )r³   r´   r   r—   ry  rµ   rŸ   ru  rª  rç  r€  r¶   s     €r    r´   Ú!ProphetNetDecoderWrapper.__init__c  sV   ø€ Ü‰Ñ˜Ô ä!Ÿ|š|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÑsˆÔÜ(¨×AUÑAUÑVˆŒð 	‰Õr!   c                 ól   • U R                  U R                  U R                  R                  5       5        g r°   )rñ  ru  rç  r…  ri   s    r    rò  Ú%ProphetNetDecoderWrapper._tie_weightsl  s%   € Ø×"Ñ" 4×#7Ñ#7¸¿¹×9ZÑ9ZÓ9\Õ]r!   c                 ó&   • U R                   " U0 UD6$ r°   rù  )rj   Úargsru  s      r    r¿   Ú ProphetNetDecoderWrapper.forwardo  s   € Ø|Š|˜TÐ, VÑ,Ð,r!   )rç  ru  )ro   rp   rq   rr   rs   r   r´   rò  r¿   rw   rÈ   rÉ   s   @r    r[  r[  ]  s%   ø† ñð
Ð/÷ ò^÷-ð -r!   r[  )rª  rt  rY  r  rÞ  rˆ   rZ  )9rs   râ  r<   rf   Údataclassesr   Útypingr   r   r   r   Útorch.utils.checkpointr   r   Útorch.nnr	   Úactivationsr   Ú
generationr   Úmodeling_outputsr   Úmodeling_utilsr   Úutilsr   r   r   Úconfiguration_prophetnetr   Ú
get_loggerro   r¼  r   r4   rH   rS   rU   ry   r€   r†   rˆ   r—   r­   ÚModulerË   r  r  rK  r]  rt  rª  rÞ  r  rY  r[  Ú__all__rn   r!   r    Ú<module>r‘     sN  ðñ Yã Û Û Ý !ß )Ñ )ã Û ß Ý å !Ý )Ý /Ý -ß 9Ñ 9Ý 6ð 
×	Ò	˜HÓ	%€ôQò7ô" ò6Mð. ôQ% ó Q%ó ðQ%ðh ôR% ;ó R%ó ðR%ðj ô8@ ;ó 8@ó ð8@ðv ô:@ ó :@ó ð:@ðz ô#! ó #!ó ð#!ôL(- R§\¡\ô (-ôV~B˜"Ÿ)™)ô ~BôB˜BŸI™Iô ô.|/ 2§9¡9ô |/ô~	(˜RŸY™Yô (ôVQ˜RŸY™Yô Qñh ðñô
{
Ð1ó {
óð
{
ñ| ðñô
RGÐ1ó RGóð
RGðj
 ôP
Ð/ó P
ó ðP
ñf ðñô
D'Ð)BÀOó D'óð
D'ñN ðñô
ZÐ5°ó Zóð
Zôz-Ð8ô -ò,r!   