ó
    fT–h][ ã                   ó¶  • S r SSKrSSKJr  SSKJrJrJrJrJ	r	  SSK
r
SSK
Jr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  \R<                  " \5      r \ " S S\5      5       r!\ " S S\5      5       r"\" SS9 " S S\5      5       r#\ " S S\#5      5       r$\" SS9 " S S\#5      5       r%\" SS9 " S S\#\5      5       r&/ SQr'g) zRAG model implementation.é    N)Ú	dataclass)ÚCallableÚListÚOptionalÚTupleÚUnion)Únné   )ÚPretrainedConfig)ÚGenerationConfigÚGenerationMixinÚLogitsProcessorListÚStoppingCriteriaList)ÚModelOutput)ÚPreTrainedModel)Úauto_docstringÚloggingé   )Ú	RagConfig)ÚRagRetrieverc                   ó  • \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                      \	S	'   Sr\\R                      \	S
'   Sr\\R                      \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Srg)ÚRetrievAugLMMarginOutputé$   a  
Base class for retriever augmented marginalized models outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
        each vocabulary token.
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).

        Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
        (see `past_key_values` input) to speed up sequential decoding.
    retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
        Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
        the `doc_scores`.
    retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
        The indexes of the embedded documents retrieved by the retriever.
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.
    question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
        model.
    question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
    question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
    generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
    generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
    generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
NÚlossÚlogitsÚ
doc_scoresÚpast_key_valuesÚretrieved_doc_embedsÚretrieved_doc_idsÚcontext_input_idsÚcontext_attention_maskÚ"question_encoder_last_hidden_state.Úquestion_enc_hidden_statesÚquestion_enc_attentionsÚgenerator_enc_last_hidden_stateÚgenerator_enc_hidden_statesÚgenerator_enc_attentionsÚgenerator_dec_hidden_statesÚgenerator_dec_attentionsÚgenerator_cross_attentions© )Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__r   r   ÚtorchÚFloatTensorÚ__annotations__r   r   r   r   r   r   Ú
LongTensorr    r!   r"   r#   r   r$   r%   r&   r'   r(   r)   r*   Ú__static_attributes__r+   ó    Ú\/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/rag/modeling_rag.pyr   r   $   sÈ  ‡ ñHðT )-€Dˆ(5×$Ñ$Ñ
%Ó,Ø*.€FˆHU×&Ñ&Ñ'Ó.Ø.2€J˜×*Ñ*Ñ+Ó2Ø9=€OX˜d 5×#4Ñ#4Ñ5Ñ6Ó=Ø8<Ð˜( 5×#4Ñ#4Ñ5Ó<Ø48Ðx × 0Ñ 0Ñ1Ó8Ø48Ðx × 0Ñ 0Ñ1Ó8Ø9=Ð˜H U×%5Ñ%5Ñ6Ó=ØFJÐ&¨°×1BÑ1BÑ(CÓJØJNÐ ¨¨u×/@Ñ/@À#Ð/EÑ)FÑ GÓNØGKÐ˜X e¨E×,=Ñ,=¸sÐ,BÑ&CÑDÓKØCGÐ# X¨e×.?Ñ.?Ñ%@ÓGØKOÐ ¨%°×0AÑ0AÀ3Ð0FÑ*GÑ!HÓOØHLÐ˜h u¨U×->Ñ->ÀÐ-CÑ'DÑEÓLØKOÐ ¨%°×0AÑ0AÀ3Ð0FÑ*GÑ!HÓOØHLÐ˜h u¨U×->Ñ->ÀÐ-CÑ'DÑEÓLØJNÐ ¨¨u×/@Ñ/@À#Ð/EÑ)FÑ GÖNr6   r   c                   óæ  • \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   Sr\\R                     \	S
'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Srg)ÚRetrievAugLMOutputéƒ   aK  
Args:
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
        each vocabulary token.
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).

        Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
        (see `past_key_values` input) to speed up sequential decoding.
    retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
        Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
        the `doc_scores`.
    retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
        The indexes of the embedded documents retrieved by the retriever.
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.
    question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
        model.
    question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
    question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
    generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
    generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
    generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
Nr   r   r   r   r   r    r!   r"   .r#   r$   r%   r&   r'   r(   r)   r*   r+   )r,   r-   r.   r/   r0   r   r   r1   r2   r3   r   r   r   r   r   r4   r    r!   r"   r#   r   r$   r%   r&   r'   r(   r)   r*   r5   r+   r6   r7   r9   r9   ƒ   s³  ‡ ñDðL +/€FˆHU×&Ñ&Ñ'Ó.Ø.2€J˜×*Ñ*Ñ+Ó2Ø9=€OX˜d 5×#4Ñ#4Ñ5Ñ6Ó=Ø8<Ð˜( 5×#4Ñ#4Ñ5Ó<Ø48Ðx × 0Ñ 0Ñ1Ó8Ø48Ðx × 0Ñ 0Ñ1Ó8Ø9=Ð˜H U×%5Ñ%5Ñ6Ó=ØFJÐ&¨°×1BÑ1BÑ(CÓJØJNÐ ¨¨u×/@Ñ/@À#Ð/EÑ)FÑ GÓNØGKÐ˜X e¨E×,=Ñ,=¸sÐ,BÑ&CÑDÓKØCGÐ# X¨e×.?Ñ.?Ñ%@ÓGØKOÐ ¨%°×0AÑ0AÀ3Ð0FÑ*GÑ!HÓOØHLÐ˜h u¨U×->Ñ->ÀÐ-CÑ'DÑEÓLØKOÐ ¨%°×0AÑ0AÀ3Ð0FÑ*GÑ!HÓOØHLÐ˜h u¨U×->Ñ->ÀÐ-CÑ'DÑEÓLØJNÐ ¨¨u×/@Ñ/@À#Ð/EÑ)FÑ GÖNr6   r9   a±  
    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
    Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.

    RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
    generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
    )Úcustom_introc            
       ó^   • \ rS rSr\rSrSrSr\	   SS\
\   S\
\   S\S\4S	 jj5       rS
rg)ÚRagPreTrainedModeléÝ   ÚragTNÚ.question_encoder_pretrained_model_name_or_pathÚ'generator_pretrained_model_name_or_pathÚ	retrieverÚreturnc                 óÌ  • UR                  5        VVs0 s H,  u  pVUR                  S5      (       d  M  U[        S5      S U_M.     nnnUR                  5        VVs0 s H,  u  pVUR                  S5      (       d  M  U[        S5      S U_M.     nnnUR                  5        H	  n	USU	-   	 M     UR                  5        H	  n	USU	-   	 M     UR	                  SS5      n
U
cL  Uc   S5       eSSKJn  SU;  a#  SS	KJn  UR                  " U40 UDS
S0D6u  p×X×S'   UR                  " U40 UD6n
UR	                  SS5      nUcN  Uc   S5       eSSKJ
n  SU;  a%  SS	KJn  UR                  " U40 UDS
S0D6u  nnUUS'   UR                  " U40 UD6nUR                  SS5      nUc,  [        R                  " U
R                  UR                  40 UD6nU " X®UUS9$ s  snnf s  snnf )a+  
Instantiates an question encoder and a generator from one or two base classes of the library from pretrained
model checkpoints.

The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
the model, you need to first set it back in training mode with `model.train()`.

Params:
    question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
        Information necessary to initiate the question encoder. Can be either:

            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
              this case, `from_tf` should be set to `True` and a configuration object should be provided as
              `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
              PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

    generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
        Information necessary to initiate the generator. Can be either:

            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
              this case, `from_tf` should be set to `True` and a configuration object should be provided as
              `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
              PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

    model_args (remaining positional arguments, *optional*):
        All remaining positional arguments will be passed to the underlying model's `__init__` method.
    retriever ([`RagRetriever`], *optional*):
        The retriever to use.
    kwwargs (remaining dictionary of keyword arguments, *optional*):
        Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
        `output_attentions=True`).

        - To update the question_encoder configuration, use the prefix *question_encoder_* for each
          configuration parameter.
        - To update the generator configuration, use the prefix *generator_* for each configuration parameter.
        - To update the parent model configuration, do not use a prefix for each configuration parameter.

        Behaves differently depending on whether a `config` is provided or automatically loaded.

Example:

```python
>>> from transformers import RagModel

>>> # initialize a RAG from two pretrained models.
>>> model = RagModel.from_pretrained_question_encoder_generator(
...     "facebook/dpr-question_encoder-single-nq-base", "google-t5/t5-small"
... )
>>> # saving model after fine-tuning
>>> model.save_pretrained("./rag")
>>> # load fine-tuned model
>>> model = RagModel.from_pretrained("./rag")
```Úquestion_encoder_NÚ
generator_ÚmodelznIf `model` is not defined as an argument, a `question_encoder_pretrained_model_name_or_path` has to be definedé   ©Ú	AutoModelÚconfig)Ú
AutoConfigÚreturn_unused_kwargsTzqIf `generator_model` is not defined as an argument, a `generator_pretrained_model_name_or_path` has to be defined©ÚAutoModelForSeq2SeqLM)Úquestion_encoderÚ	generatorrK   rB   )ÚitemsÚ
startswithÚlenÚkeysÚpopÚauto.modeling_autorJ   Úauto.configuration_autorL   Úfrom_pretrainedrO   Úgetr   Ú'from_question_encoder_generator_configsrK   )Úclsr@   rA   rB   ÚkwargsÚargumentÚvalueÚkwargs_question_encoderÚkwargs_generatorÚkeyrP   rJ   rL   Úquestion_encoder_configrQ   rO   Úgenerator_configrK   s                     r7   Ú*from_pretrained_question_encoder_generatorÚ=RagPreTrainedModel.from_pretrained_question_encoder_generatorì   sl  € ðL $*§<¡<¤>ô#
â#1‘Ø×"Ñ"Ð#6×7ó 8ˆH”SÐ,Ó-Ð/Ð0°%Ò7Ù#1ð 	 ñ #
ð $*§<¡<¤>ô
â#1‘Ø×"Ñ" <×0ó 1ˆH”S˜Ó&Ð(Ð)¨5Ò0Ù#1ð 	ñ 
ð +×/Ñ/Ö1ˆCØÐ*¨SÑ0Ò1ñ 2à#×(Ñ(Ö*ˆCØ| cÑ)Ò*ñ +ð 3×6Ñ6°wÀÓEÐØÑ#ØAÑMð ðóÐMõ 7àÐ6Ó6Ý@àCM×C]ÒC]ØBñDà-ñDð *.òDÑ@Ð'ð
 5L¨Ñ1à(×8Ò8Ø>ñ ØBYñ Ðð %×(Ñ(¨°$Ó7ˆ	ØÑØ:ÑFð ð!óÐFõ CàÐ/Ó/Ý@à5?×5OÒ5OØ;ñ6Ø?Oñ6Øfjò6Ñ2Ð Ð"2ð .>Ð  Ñ*à-×=Ò=Ø7ñØ;KñˆIð
 —‘˜H dÓ+ˆØ‰>Ü×FÒFØ ×'Ñ'¨×)9Ñ)9ñØ=CñˆFñ Ð$4ÐRXÐdmÑnÐnùóO#
ùó
s   ”G´GÁG Á=G r+   )NNN)r,   r-   r.   r/   r   Úconfig_classÚbase_model_prefixÚ_supports_flash_attn_2Ú_supports_sdpaÚclassmethodr   Ústrr   r   re   r5   r+   r6   r7   r=   r=   Ý   ss   † ð €LØÐØ!ÐØ€Nàð IMØAEØ"&ñ	Joà8@À¹ðJoð 2:¸#±ðJoð  ð	Joð 
ôJoó óJor6   r=   c            "       ó  ^ • \ rS rSr    SS\\   S\\   S\\   S\\   4U 4S jjjr\	              SS\\
R                     S\\
R                     S	\\\\
R                           S
\\
R                     S\\
R                     S\\\\
R                           S\\
R                     S\\
R                     S\\
R                     S\\   S\\   S\\   S\\   S\\   S\\\
R                     \4   4S jj5       rSrU =r$ )ÚRagModeliz  rK   rP   rQ   rB   c                 ób  >• Uc  Ub  Uc   S5       eUc-  [         R                  " UR                  UR                  40 UD6nO1[        XR                  5      (       d   SU SU R                   35       e[
        TU ]  U5        Uc!  SSKJn  UR                  UR                  5      nUc!  SSKJn  UR                  UR                  5      nX@l        U R                  b9  [        U[        5      (       d   S[        U R                  5       S	35       eX@l        X l
        X0l        SU l        S
U l        g)á‘  
question_encoder (`PreTrainedModel`, *optional*):
    The model responsible for encoding the question into hidden states for retrieval.
generator (`PreTrainedModel`, *optional*):
    The model responsible for generating text based on retrieved documents.
retriever (`RagRetriever`, *optional*):
    The component responsible for retrieving documents from a knowledge base given the encoded question.
NzQEither a configuration or an question_encoder and a generator has to be provided.zconfig: z has to be of type rH   rI   rN   z`self.retriever` is of type z&, but should be of type `RagRetriever`F)r   r[   rK   Ú
isinstancerg   ÚsuperÚ__init__rW   rJ   Úfrom_configrP   rO   rQ   rB   r   ÚtypeÚctx_encoderÚcontext_encoder_training)	ÚselfrK   rP   rQ   rB   r]   rJ   rO   Ú	__class__s	           €r7   rs   ÚRagModel.__init__|  s8  ø€ ð  Ñ!Ð&6Ñ&BÀyÑG\ð 	
Ø_ó	
Ð]ð ‰>Ü×FÒFØ ×'Ñ'¨×)9Ñ)9ñØ=Cñ‰Fô ˜f×&7Ñ&7×8Ñ8Ðs¸HÀVÀHÐL_Ð`d×`qÑ`qÐ_rÐ:sÓsÐ8Ü‰Ñ˜Ô ØÑ#Ý6à(×4Ñ4°V×5LÑ5LÓMÐàÑÝBà-×9Ñ9¸&×:JÑ:JÓKˆIà"ŒØ>‰>Ñ%Ü˜i¬×6Ñ6ð Ø.¬t°D·N±NÓ/CÐ.DÐDjÐkóÐ6ð 'ŒNà 0ÔØ"ŒàˆÔØ(-ˆÕ%r6   Ú	input_idsÚattention_maskÚencoder_outputsÚdecoder_input_idsÚdecoder_attention_maskr   r   r    r!   Ú	use_cacheÚoutput_attentionsÚoutput_hidden_statesÚoutput_retrievedÚn_docsrC   c                 ób  • Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R
                  nU R                  SL=(       a%    USL =(       d    U	SL =(       d    USL =(       a    USL nUGc*  U(       Ga  U R                  XSS9nUS   nU R                  UUR                  5       R                  S[        R                  S9R                  5       U R                  R                   R                  USS9nU R                  (       aæ  US	   US
   US   US   US   US   4u  nn	nnnnUR                  U5      nU	R                  U5      n	UR                  U5      nUR                  U5      nU R!                  UUSS9R"                  nUR%                  SUUR&                  S   5      n[        R(                  " UR+                  S5      UR-                  SS5      5      R/                  S5      nO­US	   US
   US   US   4u  p‰nnUR                  U5      nUR                  U5      nU	R                  U5      n	[        R(                  " UR+                  S5      UR-                  SS5      5      R/                  S5      nOUc   S5       eU	c   S5       eUc   S5       eUc   S5       eUR&                  S   U-  S:X  d   SU SUR&                  S    S35       eUb  UR1                  USS9nUb  UR1                  USS9nU R                  UU	UUUUU
USS9	nU(       d  SnSnSnSnSnOWR2                  nUR4                  nU(       a  U(       d  SnSn	SnSn[7        S)0 SUR8                  _SU_SUR:                  _S	U_S
U	_SW_SW_S W_S!U_S"U_S#UR<                  _S$UR>                  _S%UR@                  _S&URB                  _S'URD                  _S(URF                  _6$ )*aù  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagModel
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"])
```NT)r|   Úreturn_dictr   Úcpu©ÚdeviceÚdtypeÚpt©Úprefixr„   Úreturn_tensorsr    r!   r   Útokenized_doc_idsÚtokenized_doc_attention_maskÚdoc_idséÿÿÿÿr   rH   z˜Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z‘Make sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z^Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function.úM The first dimension of `context_input_ids` should be a multiple of `n_docs`=ú	, but is Ú.©Údim)	r{   r|   r}   r~   r   r   r€   r   r†   ©Nr   r   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   )$rK   r„   r€   r   r‚   rƒ   rB   rP   ÚdetachÚtor1   Úfloat32ÚnumpyrQ   r   rw   rv   Úpooler_outputÚviewÚshapeÚbmmÚ	unsqueezeÚ	transposeÚsqueezeÚrepeat_interleaveÚhidden_statesÚ
attentionsr9   r   r   Úencoder_last_hidden_stateÚencoder_hidden_statesÚencoder_attentionsÚdecoder_hidden_statesÚdecoder_attentionsÚcross_attentions)rx   r{   r|   r}   r~   r   r   r   r    r!   r€   r   r‚   rƒ   r„   Úhas_to_retrieveÚquestion_enc_outputsr"   Úretriever_outputsr   Úretrieved_doc_input_idsÚretrieved_doc_attention_maskr   Úgen_outputsr#   r$   s                             r7   ÚforwardÚRagModel.forward®  s…  € ðR "Ñ-‘°4·;±;×3EÑ3EˆØ!*Ñ!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð 0@Ñ/KÑ+ÐQU×Q\ÑQ\×QmÑQmÐð N‰N $Ð&÷ (Ø" dÐ*×bÐ.DÈÐ.L×bÐPZÐ^bÐPb÷(à 4Ð'ð 	ð Ò"ßØ'+×'<Ñ'<ØÈ$ð (=ð (Ð$ð 6JÈ!Ñ5LÐ2à$(§N¡NØØ6×=Ñ=Ó?×BÑBÈ%ÔW\×WdÑWdÐBÐe×kÑkÓmØŸ>™>×0Ñ0×7Ñ7Ø!Ø#'ð %3ð %Ð!ð ×0×0ð *Ð*=Ñ>Ø)Ð*BÑCØ)Ð*@ÑAØ)Ð*=Ñ>Ø)Ð*HÑIØ)¨)Ñ4ðñØ)Ø.Ø,Ø/Ø4Ø)ð ):×(<Ñ(<¸YÓ(GÐ%Ø-C×-FÑ-FÀyÓ-QÐ*à.E×.HÑ.HÈÓ.SÐ+Ø3O×3RÑ3RÐS\Ó3]Ð0Ø+/×+;Ñ+;Ø/Ð@\Ðjnð ,<ð ,ç#‘mð )ð ,@×+DÑ+DØ˜FÐ$F×$LÑ$LÈQÑ$Oó,Ð(ô
 "'§¢Ø:×DÑDÀQÓGÐI]×IgÑIgÐhiÐklÓImó"ç‘g˜a“jñ ð *Ð*=Ñ>Ø)Ð*BÑCØ)Ð*@ÑAØ)¨)Ñ4ð	jÑfÐ%Ð?SÐUfð ,@×+BÑ+BÐCeÓ+fÐ(Ø(9×(<Ñ(<¸YÓ(GÐ%Ø-C×-FÑ-FÀyÓ-QÐ*ô "'§¢Ø:×DÑDÀQÓGÐI]×IgÑIgÐhiÐklÓImó"ç‘g˜a“jñ ð )Ñ4ð ðPóÐ4ð .Ñ9ð ðTóÐ9ð "Ñ-ð ðJóÐ-ð
 Ñ%ð 	
Øló	
Ð%ð × Ñ  Ñ# fÑ,°Ó2ð 	
Ø[Ð\bÐ[cð dØ!×'Ñ'¨Ñ*Ð+¨1ð.ó	
Ð2ð Ñ(Ø 1× CÑ CÀFÐPQÐ CÐ RÐà!Ñ-Ø%;×%MÑ%MÈfÐZ[Ð%MÐ%\Ð"à—n‘nØ'Ø1Ø+Ø/Ø#9Ø+ØØ/Øð %ð 

ˆö Ø15Ð.Ø)-Ð&Ø&*Ð#Ø#'Ð Ø $Ñà)=×)KÑ)KÐ&Ø&:×&EÑ&EÐ#æÖ&6à 'ÐØ%)Ð"Ø#'Ð Ø $Ðä!ò 
Ø×%Ò%ð
á!ð
ð (×7Ò7ð
ñ 0ð	
ñ
 $:ð
ñ "6ð
ñ 0ð
ñ 0Rð
ñ (Bð
ñ %<ð
ð -8×,QÒ,Qð
ð )4×(IÒ(Ið
ð &1×%CÒ%Cð
ð )4×(IÒ(Ið
ð &1×%CÒ%Cð
ð  (3×'CÒ'Cð!
ð 	
r6   )rw   rv   rQ   rP   rB   ©NNNN)NNNNNNNNNNNNNN)r,   r-   r.   r/   r   r   r   r   rs   r   r1   r4   ÚTensorr   r2   Ú
BoolTensorÚboolÚintr   r9   r³   r5   Ú__classcell__©ry   s   @r7   rn   rn   z  sÙ  ø† ð .2Ø6:Ø/3Ø,0ñ0.àÐ)Ñ*ð0.ð # ?Ñ3ð0.ð ˜OÑ,ð	0.ð
 ˜LÑ)÷0.ð 0.ðd ð 15Ø15ØEIØ8<Ø=AØEIØ26Ø8<Ø=AØ$(Ø,0Ø/3Ø+/Ø $ñd
à˜E×,Ñ,Ñ-ðd
ð ! §¡Ñ.ðd
ð " %¨¨e×.?Ñ.?Ñ(@Ñ"AÑBð	d
ð
 $ E×$4Ñ$4Ñ5ðd
ð !)¨×)9Ñ)9Ñ :ðd
ð " %¨¨e×.?Ñ.?Ñ(@Ñ"AÑBðd
ð ˜U×.Ñ.Ñ/ðd
ð $ E×$4Ñ$4Ñ5ðd
ð !)¨×)9Ñ)9Ñ :ðd
ð ˜D‘>ðd
ð $ D™>ðd
ð ' t™nðd
ð # 4™.ðd
ð ˜‘ðd
ð  
ˆuU—\‘\Ñ"Ð$6Ð6Ñ	7ô!d
ó öd
r6   rn   zu
    A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
    c            &       ó®  ^ • \ rS rSr    S'S\\   S\\   S\\   S\\   4U 4S jjjrS\4S jr	S\4S	 jr
\                 S(S
\\R                     S\\R                     S\\\\R                           S\\R                     S\\R                      S\\\\R                           S\\R                     S\\R                     S\\R"                     S\\   S\\   S\\   S\\   S\\   S\\   S\\R                     S\\   S\4$S jj5       r\S 5       r\S 5       r\S 5       r\R4                  " 5                S)S
\\R                     S\\R                     S\\R                     S\\R                     S\\R"                     S \\   S!\\   S"\\   S\\   S\R                  4S# jj5       r S*S$ jr\S% 5       rS&rU =r $ )+ÚRagSequenceForGenerationi–  rK   rP   rQ   rB   c                 óÀ   >• Uc  Ub  Uc   S5       eUc,  [         R                  " UR                  UR                  40 UD6n[        TU ]  U5        [        XX4S9U l        g©rp   NzHEither a configuration or an encoder and a generator has to be provided.)rK   rP   rQ   rB   ©r   r[   rK   rr   rs   rn   r?   ©rx   rK   rP   rQ   rB   r]   ry   s         €r7   rs   Ú!RagSequenceForGeneration.__init__œ  sv   ø€ ð  Ñ!Ð&6Ñ&BÀyÑG\ð 	
ØVó	
Ð]ð ‰>Ü×FÒFØ ×'Ñ'¨×)9Ñ)9ñØ=CñˆFô 	‰Ñ˜Ô ô  6ÐXaÑwˆr6   c                 ó$   • XR                   l        g r˜   ©r?   rB   ©rx   rB   s     r7   Úset_retrieverÚ&RagSequenceForGeneration.set_retriever¹  ó   € Ø&‰Õr6   rv   c                 óF   • SU R                   l        XR                   l        g ©NT©r?   rw   rv   ©rx   rv   s     r7   Ú set_context_encoder_for_trainingÚ9RagSequenceForGeneration.set_context_encoder_for_training¼  ó   € Ø,0ˆ‰Ô)Ø*‰Õr6   r{   r|   r}   r~   r   r   r    r!   r   r€   r   r‚   rƒ   Úexclude_bos_scoreÚreduce_lossÚlabelsr„   rC   c                 ó,  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Uc  UnSn
U R	                  UUUUUUUU	UU
UUUUS9nSnUb=  U R                  UR                  UR                  UUU R                   R                  UUS9n[        S0 SU_SUR                  _SUR                  _SUR                  _S	UR                  _S
UR                  _SUR                  _SUR                  _SUR                  _SUR                   _SUR"                  _SUR$                  _SUR&                  _SUR(                  _SUR*                  _SUR,                  _SUR.                  _6$ )aë  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
exclude_bos_score (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when computing
    the loss.
reduce_loss (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
    operation.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-sequence-nq")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels)

>>> # or use retriever separately
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(
...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
... ).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
...     decoder_input_ids=labels,
... )
```NF©r{   r|   r}   r~   r   r    r!   r   r   r€   r   r‚   rƒ   r„   )rÑ   ÚepsilonrÐ   r„   r   r   r   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   )rK   r„   rÐ   rÑ   r?   Úget_nllr   r   Úlabel_smoothingr   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   )rx   r{   r|   r}   r~   r   r   r    r!   r   r€   r   r‚   rƒ   rÐ   rÑ   rÒ   r„   r]   Úoutputsr   s                        r7   r³   Ú RagSequenceForGeneration.forwardÀ  sü  € ðN "Ñ-‘°4·;±;×3EÑ3EˆØ1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐØ%0Ñ%<‘kÀ$Ç+Á+×BYÑBYˆàÑØ Ñ(Ø$*Ð!ØˆIà—(‘(ØØ)Ø+Ø/Ø#9Ø/Ø#9Ø!Ø+ØØ/Ø!5Ø-Øð ð 
ˆð" ˆØÑØ—<‘<Ø—‘Ø×"Ñ"Ø!Ø'ØŸ™×3Ñ3Ø"3Øð  ð ˆDô (ò 
Ùð
à—>’>ð
ð ×)Ò)ð
ð $×3Ò3ð	
ð
 &×7Ò7ð
ð $+×#AÒ#Að
ð ")×!=Ò!=ð
ð &×7Ò7ð
ð 07×/YÒ/Yð
ð (/×'IÒ'Ið
ð %,×$CÒ$Cð
ð -4×,SÒ,Sð
ð )0×(KÒ(Kð
ð &-×%EÒ%Eð
ð )0×(KÒ(Kð
ð  &-×%EÒ%Eð!
ð" (/×'IÒ'Ið#
ð 	
r6   c                 ó.   • U R                   R                  $ r˜   rÄ   ©rx   s    r7   rB   Ú"RagSequenceForGeneration.retrievera  ó   € àx‰x×!Ñ!Ð!r6   c                 ó.   • U R                   R                  $ r˜   ©r?   rQ   rÛ   s    r7   rQ   Ú"RagSequenceForGeneration.generatore  rÝ   r6   c                 ó.   • U R                   R                  $ r˜   ©r?   rP   rÛ   s    r7   rP   Ú)RagSequenceForGeneration.question_encoderi  ó   € àx‰x×(Ñ(Ð(r6   Údo_deduplicationÚnum_return_sequencesÚ	num_beamsc
                 óÊ  • U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc
  Uc   S5       eU R
                  b’  Uc  U R                  XS9S   nU R                  UUR                  5       R                  S[        R                  S9R                  5       U R                  R                   R                  U	SS9S	   nUR                  U5      n/ nXŠS
'   XŠS'   SU
S'   Ub  UR                  S   OUR                  S   U	-  n[        U5       GHW  nX?U	-  US-   U	-   nU R                  R                   " U40 U
D6nU(       aV  [        R"                  " [%        U Vs0 s H  n['        UR)                  5       5      U_M     snR+                  5       5      5      nUR                  S   nUb   XUS-    R-                  US5      nU " UUSS9nOnUc   S5       eUc   S5       eUR-                  US5      nXOU	-  US-   U	-   nUR-                  US5      nX_US-   2SS24   nUR-                  US5      nU " UUUUSS9nUS   * R/                  U5      S   nUR1                  UU   5        GMZ     U R3                  XÐR                   R                  R4                  S9$ s  snf )a¸  
Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation
for more information on how to set other generate input parameters.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The sequence used as a prompt for the generation. If `input_ids` is not passed, then
        `context_input_ids` has to be provided.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
        retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model is not initialized with a `retriever` or `input_ids` is not given, `context_input_ids` and
        `context_attention_mask` have to be provided to the forward pass. They are returned by
        [`~RagRetriever.__call__`].
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.

        If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores` has to be
        provided to the forward pass. `doc_scores` are returned by [`~RagRetriever.__call__`].
    do_deduplication (`bool`, *optional*):
        Whether or not to deduplicate the generations from different context documents for a given input. Has
        to be set to `False` if used while training with distributed backend.
    num_return_sequences(`int`, *optional*, defaults to 1):
        The number of independently computed returned sequences for each element in the batch. Note that this
        is not the value we pass to the `generator`'s `[`~generation.GenerationMixin.generate`]` function,
        where we set `num_return_sequences` to `num_beams`.
    num_beams (`int`, *optional*, defaults to 1):
        Number of beams for beam search. 1 means no beam search.
    n_docs (`int`, *optional*, defaults to `config.n_docs`)
        Number of documents to retrieve and/or number of documents for which to generate an answer.
    kwargs (`Dict[str, Any]`, *optional*):
        Additional kwargs will be passed to [`~generation.GenerationMixin.generate`].

Return:
    `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
    sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all batches
    finished early due to the `eos_token_id`.
Nz= At least one of input_ids or context_input_ids must be given©r|   r   r‡   rˆ   r‹   rŒ   r    rç   ræ   r|   r   T)rÒ   rÐ   zMake sure that `context_attention_mask` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z‘Make sure that `doc_scores` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.)r    r!   r   rÒ   rÐ   r   )Úpad_token_id)rK   r„   rå   ræ   rç   rB   rP   r™   rš   r1   r›   rœ   rQ   r   rŸ   ÚrangeÚgenerateÚstackÚlistrl   ÚtolistÚvaluesÚrepeatÚtopkÚappendÚ_cat_and_padrê   )rx   r{   r|   r    r!   r   rå   ræ   rç   r„   Úmodel_kwargsÚnum_doc_return_sequencesÚquestion_hidden_statesÚhyposÚ
batch_sizeÚindexÚgenerator_input_idsÚoutput_sequencesÚkÚnum_candidatesÚnew_input_idsrØ   Úindividual_input_idsÚindividual_attention_maskÚindividual_doc_scoresÚtop_cand_indss                             r7   rì   Ú!RagSequenceForGeneration.generatem  sf  € ðB "Ñ-‘°4·;±;×3EÑ3EˆØ/?Ñ/KÑ+ÐQU×Q\ÑQ\×QmÑQmÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	!ð "+Ñ!6‘I¸D¿K¹K×<QÑ<Qˆ	àÑ$Ð(9Ñ(Eð 	
ØKó	
ÐEð >‰>Ñ%Ð*;Ñ*CØ%)×%:Ñ%:¸9Ð%:Ð%dÐefÑ%gÐ"Ø $§¡ØØ&×-Ñ-Ó/×2Ñ2¸%ÄuÇ}Á}Ð2ÐU×[Ñ[Ó]Ø—~‘~×,Ñ,×3Ñ3ØØ#ð !/ð !ð "ñ!#Ðð !2× 4Ñ 4°YÓ ?ÐàˆØ$-[Ñ!Ø/8Ð+Ñ,Ø)-ˆÐ%Ñ&à+4Ñ+@Y—_‘_ QÒ'ÐFW×F]ÑF]Ð^_ÑF`ÐdjÑFjˆ
ä˜:×&ˆEà"3¸F±NÀeÈaÁiÐSYÑEYÐ"ZÐà#Ÿ~™~×6Ò6Ø#ñ àñ Ðö  ä#(§;¢;¬tÑQaÓ4bÒQaÈA´S¸¿¹»³_ÀaÒ5GÑQaÑ4b×4iÑ4iÓ4kÓ/lÓ#mÐ à-×3Ñ3ØñˆNð
 Ñ$Ø )°%¸!±)Ð <× CÑ CÀNÐTUÓ VÙ˜}Ð5EÐY]Ñ^‘à-Ñ9ð ðTóÐ9ð "Ñ-ð ðJóÐ-ð
 (;×'AÑ'AØ" Aó(Ð$ð -CÈ6Á>ÐUZÐ]^ÑU^ÐbhÑThÐ,iÐ)Ø,E×,LÑ,LÈ^Ð]^Ó,_Ð)à(2¸EÀA¹IÐ3FÊÐ3IÑ(JÐ%Ø(=×(DÑ(DÀ^ÐUVÓ(WÐ%áØ&:Ø+DØ4Ø+Ø&*ñð & f™oÐ-×3Ñ3Ð4LÓMÈaÑPˆMð L‰LÐ)¨-Ñ8×9ñg 'ðj × Ñ  ·[±[×5JÑ5J×5WÑ5WÐ ÐXÐXùòW 5cs   Æ%$K c                 ó2  ^ ^• [         R                  " TS S 2SS 24   TR                  TR                  S   S5      R	                  T R
                  R                  R                  5      /S5      mUb  UOT R
                  R                  nT R
                  R                  =(       d     T R
                  R                  R                  nUS L=(       a&    TS S 2S4   R                  U5      R                  5       n	U U4S jn
[        R                  R                  USS9R                  UR                  S   U-  USUR!                  S5      5      n[        R                  R                  USS9R#                  S5      R#                  S5      nUS S 2S S 2S S2S S 24   nUS S 2S S 2SS2S S 24   nUS S 2S S 2SS 2S S 24   n[         R                  " XÞU-   U/SS9nTR#                  S5      R#                  S5      R%                  SUSS5      mTR'                  5       UR'                  5       :X  d   eUR)                  STS9nUR+                  SSS	9nU
" UU5      u  nnU(       a$  U	(       a  US S 2S S 2SS 24   R+                  S5      OUR+                  S5      nUR+                  S5      nUR-                  S5      nUR-                  S5      nU* nU* nU(       a   UR+                  5       nUR+                  5       nUUR!                  S5      -  nS
U-
  U-  UU-  -   nU$ )Nr   r   c                 ó  >• TR                  TR                  R                  R                  5      nUR	                  5       (       a$  U R                  US5        UR                  US5        U R                  S5      UR                  S5      4$ ©Nç        r’   ©ÚeqrK   rQ   rê   ÚanyÚmasked_fill_r£   ©ÚllÚ
smooth_objÚpad_maskrx   Útargets      €€r7   Ú
_mask_padsÚ4RagSequenceForGeneration.get_nll.<locals>._mask_pads  óh   ø€ Ø—y‘y §¡×!6Ñ!6×!CÑ!CÓDˆHØ|‰|~‰~Ø—‘ ¨#Ô.Ø×'Ñ'¨°#Ô6Ø—:‘:˜b“> :×#5Ñ#5°bÓ#9Ð9Ð9r6   r’   r–   rH   ©r—   rú   T©r—   Úkeepdimç      ð?)r1   ÚcatÚnewrŸ   Úfill_rK   rQ   rê   r„   Úbos_token_idr
  Úallr	   Ú
functionalÚlog_softmaxrž   Úsizer¡   rñ   r—   ÚgatherÚsumÚ	logsumexp)rx   Ú
seq_logitsr   r  rÑ   rÕ   rÐ   r„   r  Úuse_bosr  Úseq_logprobsÚdoc_logprobsÚfirst_token_scoresÚsecond_token_scoresÚ	remainderÚrag_logprobsr  r  Únll_lossÚsmooth_lossÚeps_ir   s   `  `                   r7   rÖ   Ú RagSequenceForGeneration.get_nll  sÓ  ù€ ô —’Ø’Aq‘rE‰]˜FŸJ™J v§|¡|°A¡¸Ó:×@Ñ@ÀÇÁ×AVÑAV×AcÑAcÓdÐeÐghó
ˆð "Ñ-‘°4·;±;×3EÑ3Eˆð —{‘{×/Ñ/×U°4·;±;×3HÑ3H×3UÑ3UˆØ dÐ*×R¨v²a¸°d©|¯©¸|Ó/L×/PÑ/PÓ/Rˆö	:ô —}‘}×0Ñ0°ÀÐ0ÐD×IÑIØ×Ñ˜QÑ 6Ñ)¨6°2°z·±ÀrÓ7Jó
ˆô —}‘}×0Ñ0°ÀÐ0ÐC×MÑMÈbÓQ×[Ñ[Ð\^Ó_ˆð *ª!ªQ°°°²A¨+Ñ6ÐØ*ª1ªa°°1°²a¨<Ñ8ÐØ ¢¢A q¡rª1 Ñ-ˆ	Ü—y’yÐ"4ÈLÑ6XÐZcÐ!dÐjkÑlˆð ×!Ñ! !Ó$×.Ñ.¨rÓ2×9Ñ9¸!¸VÀQÈÓJˆØz‰z‹|˜|×/Ñ/Ó1Ó1Ð1Ð1à× Ñ  R¨vÐ Ð6ˆØ!×%Ñ%¨"°dÐ%Ð;ˆ
á# B¨
Ó3‰ˆˆJö %6¾'ˆR’’1a‘b‰\×Ñ˜aÔ ÀrÇvÁvÈaÃyˆØ—^‘^ AÓ&ˆ
Ø\‰\˜!‹_ˆØ×)Ñ)¨!Ó,ˆ
à3ˆØ!kˆæØ—|‘|“~ˆHØ%Ÿ/™/Ó+ˆKà˜,×+Ñ+¨BÓ/Ñ/ˆØg‘ Ñ)¨E°KÑ,?Ñ?ˆØˆr6   c           
      ó~  • U S   R                  [        U  Vs/ s H  o"R                  S   PM     sn5      [        U  Vs/ s H  o"R                  S   PM     sn5      5      R	                  U5      nSnU  H:  nX#XDUR                  S   -   2S UR                  S   24'   XBR                  S   -  nM<     U$ s  snf s  snf )Nr   r   )r  r"  rŸ   Úmaxr  )Útensorsrê   ÚtÚoutputÚinds        r7   rô   Ú%RagSequenceForGeneration._cat_and_pad?  s¸   € ð A‰JN‰Nœ3±GÓ<²G¨q§¡¨¤
±GÑ<Ó=¼sÑX_ÓC`ÒX_ÐSTÇGÁGÈAÄJÑX_ÑC`Ó?aÓb×hÑhÐiuÓvð 	ð ˆÛˆAØ;<3˜qŸw™w q™zÑ)Ð)¨<¨Q¯W©W°Q©Z¨<Ð7Ñ8Ø—7‘7˜1‘:ÑŠCñ ð ˆùò  =ùÒC`s
   ˜B5¿B:©r?   rµ   ©NNNNNNNNNNNNNNNNN)	NNNNNNNNN)Fr  FN)!r,   r-   r.   r/   r   r   r   r   rs   rÆ   rÍ   r   r1   r4   r¶   r   r·   r2   r¸   r¹   r   r³   ÚpropertyrB   rQ   rP   Úno_gradrì   rÖ   Ústaticmethodrô   r5   rº   r»   s   @r7   r½   r½   –  sx  ø† ð .2Ø6:Ø/3Ø,0ñxàÐ)Ñ*ðxð # ?Ñ3ðxð ˜OÑ,ð	xð
 ˜LÑ)÷xð xð:' |ô 'ð+¸Oô +ð ð 15Ø15Ø@DØ8<Ø=AØ@DØ8<Ø=AØ26Ø$(Ø,0Ø/3Ø+/Ø,0Ø&*Ø-1Ø $ñ%^
à˜E×,Ñ,Ñ-ð^
ð ! §¡Ñ.ð^
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ð	^
ð
 $ E×$4Ñ$4Ñ5ð^
ð !)¨×)9Ñ)9Ñ :ð^
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ð^
ð $ E×$4Ñ$4Ñ5ð^
ð !)¨×)9Ñ)9Ñ :ð^
ð ˜U×.Ñ.Ñ/ð^
ð ˜D‘>ð^
ð $ D™>ð^
ð ' t™nð^
ð # 4™.ð^
ð $ D™>ð^
ð  ˜d‘^ð!^
ð" ˜×)Ñ)Ñ*ð#^
ð$ ˜‘ð%^
ð( 
"ô)^
ó ð^
ð@ ñ"ó ð"ð ñ"ó ð"ð ñ)ó ð)ð ‡]‚]ƒ_ð 15Ø59Ø8<Ø=AØ26Ø+/Ø.2Ø#'Ø $ñTYà˜E×,Ñ,Ñ-ðTYð ! ×!1Ñ!1Ñ2ðTYð $ E×$4Ñ$4Ñ5ð	TYð
 !)¨×)9Ñ)9Ñ :ðTYð ˜U×.Ñ.Ñ/ðTYð # 4™.ðTYð ' s™mðTYð ˜C‘=ðTYð ˜‘ðTYð 
×	Ñ	ôTYó ðTYðn osô9ðv ñó ör6   r½   zo
    A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
    c            &       ó0  ^ • \ rS rSr    S/S\\   S\\   S\\   S\\   4U 4S jjjrS\4S jr	S	\4S
 jr
      S0S jr\S 5       r\S 5       r\S 5       r\S 5       rS1S jr\                 S2S\\R*                     S\\R,                     S\\\\R0                           S\\R*                     S\\R2                     S\\\\R0                           S\\R*                     S\\R*                     S\\R,                     S\\   S\\   S\\   S\\   S\\   S\\   S \\R*                     S!\\   S"\4$S# jj5       r\R<                  " 5       SSSSSSSS\" 5       \ " 5       4
S\\R*                     S\\R*                     S\\R*                     S\\R*                     S\\R,                     S!\\   S$\\!   S%\\"\\R0                  /\#\   4      S&\\   S'\\    S"\R*                  4S( jj5       r$S) r%S* r&S+ r'S1S, jr(S3S- jr)S.r*U =r+$ )4ÚRagTokenForGenerationiK  NrK   rP   rQ   rB   c                 óÀ   >• Uc  Ub  Uc   S5       eUc,  [         R                  " UR                  UR                  40 UD6n[        TU ]  U5        [        XX4S9U l        gr¿   rÀ   rÁ   s         €r7   rs   ÚRagTokenForGeneration.__init__Q  sv   ø€ ð  Ñ!Ð&6Ñ&BÀyÑG\ð 	
ØVó	
Ð]ð ‰>Ü×FÒFØ ×'Ñ'¨×)9Ñ)9ñØ=CñˆFô 	‰Ñ˜Ô ô  6ÐXaÑwˆr6   c                 ó$   • XR                   l        g r˜   rÄ   rÅ   s     r7   rÆ   Ú#RagTokenForGeneration.set_retrievero  rÈ   r6   rv   c                 óF   • SU R                   l        XR                   l        g rÊ   rË   rÌ   s     r7   rÍ   Ú6RagTokenForGeneration.set_context_encoder_for_trainingr  rÏ   r6   c           
      ó6   • Ub  US S 2SS 24   nS UUUUUUSUS.	$ )Nr’   T)	r{   r}   r   r!   r~   r   r€   Údo_marginalizer„   r+   )	rx   r~   r   r|   r€   r}   r   r„   r]   s	            r7   Úprepare_inputs_for_generationÚ3RagTokenForGeneration.prepare_inputs_for_generationv  sB   € ð Ñ&à 1²!°R±S°&Ñ 9Ðð Ø.Ø$Ø&4Ø!2Ø.Ø"Ø"Øñ

ð 
	
r6   c                 ó.   • U R                   R                  $ r˜   rÄ   rÛ   s    r7   rB   ÚRagTokenForGeneration.retriever“  rÝ   r6   c                 ó.   • U R                   R                  $ r˜   rß   rÛ   s    r7   rQ   ÚRagTokenForGeneration.generator—  rÝ   r6   c                 ó.   • U R                   R                  $ r˜   râ   rÛ   s    r7   rP   Ú&RagTokenForGeneration.question_encoder›  rä   r6   c                 óZ   ^^• S mSnU  H  nU[        UU4S jU 5       5      4-  nM     U$ )zeReorders cache for generation. BART-inspired but we need to take care of the extra dimension for docsc                 óê   • U R                   S   UR                   S   -  nU R                  " SU/U R                   SS  Q76 n U R                  SU5      n U R                  " S/U R                   SS  Q76 nU$ )Nr   r’   r   rH   )rŸ   rž   Úindex_select)r¥   Ú	new_orderr„   Úresults       r7   Ú_reorder_stackedÚ>RagTokenForGeneration._reorder_cache.<locals>._reorder_stacked£  s   € Ø"×(Ñ(¨Ñ+¨y¯©¸qÑ/AÑAˆFØ)×.Ò.¨r°6ÐT¸M×<OÑ<OÐPQÐPRÐ<SÒTˆMØ)×6Ñ6°q¸)ÓDˆMØ"×'Ò'¨ÐE¨]×-@Ñ-@ÀÀÐ-DÒEˆFØˆMr6   r+   c              3   óh   >#   • U  H'  nT" UTR                  UR                  5      5      v •  M)     g 7fr˜   )rš   r‰   )Ú.0Ú
past_staterS  Úbeam_idxs     €€r7   Ú	<genexpr>Ú7RagTokenForGeneration._reorder_cache.<locals>.<genexpr>®  s.   øé € ÐpÒeoÐWaÑ& z°8·;±;¸z×?PÑ?PÓ3Q×RÐRÒeoùs   ƒ/2)Útuple)r   rX  Úreordered_pastÚ
layer_pastrS  s    `  @r7   Ú_reorder_cacheÚ$RagTokenForGeneration._reorder_cacheŸ  s?   ù€ ò	ð ˆÛ)ˆJàÜÕpÑeoÓpÓpðñ ŠNñ *ð Ðr6   c                 ón  • Ub  UOU R                   R                  n[        R                  R	                  USS9R                  UR                  S   U-  USUR                  S5      5      n[        R                  " USS9nXER                  S5      R                  S5      -   n[        R                  " USS9$ )Nr’   r–   r   r   )rK   r„   r	   r  r  rž   rŸ   r   r1   r¡   r#  )rx   r$  r   r„   r&  r'  Úlog_prob_sums          r7   ÚmarginalizeÚ!RagTokenForGeneration.marginalize³  s£   € Ø!Ñ-‘°4·;±;×3EÑ3Eˆô —}‘}×0Ñ0°ÀÐ0ÐD×IÑIØ×Ñ˜QÑ 6Ñ)¨6°2°z·±ÀrÓ7Jó
ˆô ×(Ò(¨¸Ñ;ˆØ#×&<Ñ&<¸RÓ&@×&JÑ&JÈ2Ó&NÑNˆÜŠ˜|°Ñ3Ð3r6   r{   r|   r}   r~   r   r   r    r!   r   r€   r   r‚   rƒ   rE  rÑ   rÒ   r„   rC   c                 ó€  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Uc  UnSn
U R	                  UUUUUUUU	UU
UUUUS9nSnUR
                  nUbA  Uc   eU R                  UR
                  UR                  UUU R                   R                  US9nU(       a  U R                  UUR                  U5      n[        S0 SU_SU_SUR                  _SUR                  _S	UR                  _S
UR                  _SUR                  _SUR                  _SUR                   _SUR"                  _SUR$                  _SUR&                  _SUR(                  _SUR*                  _SUR,                  _SUR.                  _SUR0                  _6$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
do_marginalize (`bool`, *optional*):
    If `True`, the logits are marginalized over all documents by making use of
    `torch.nn.functional.log_softmax`.
reduce_loss (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
    operation.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels)

>>> # or use retriever separately
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(
...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
... ).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
...     decoder_input_ids=labels,
... )

>>> # or directly generate
>>> generated = model.generate(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
... )
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
```NFrÔ   )rÑ   rÕ   r„   r   r   r   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   )rK   r„   rE  rÑ   r?   r   rÖ   r   r×   rb  r   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   )rx   r{   r|   r}   r~   r   r   r    r!   r   r€   r   r‚   rƒ   rE  rÑ   rÒ   r„   r]   rØ   r   r   s                         r7   r³   ÚRagTokenForGeneration.forward¾  s"  € ð^ "Ñ-‘°4·;±;×3EÑ3EˆØ+9Ñ+E™È4Ï;É;×KeÑKeˆØ%0Ñ%<‘kÀ$Ç+Á+×BYÑBYˆàÑØ Ñ(Ø$*Ð!ØˆIà—(‘(ØØ)Ø+Ø/Ø#9Ø/Ø#9Ø!Ø+ØØ/Ø!5Ø-Øð ð 
ˆð" ˆØ—‘ˆØÑØ$Ñ0Ð0Ð0Ø—<‘<Ø—‘Ø×"Ñ"ØØ'ØŸ™×3Ñ3Øð  ð ˆDö Ø×%Ñ% f¨g×.@Ñ.@À&ÓIˆFä'ò 
Ùð
áð
ð ×)Ò)ð
ð $×3Ò3ð	
ð
 &×7Ò7ð
ð $+×#AÒ#Að
ð ")×!=Ò!=ð
ð &×7Ò7ð
ð 07×/YÒ/Yð
ð (/×'IÒ'Ið
ð %,×$CÒ$Cð
ð -4×,SÒ,Sð
ð )0×(KÒ(Kð
ð &-×%EÒ%Eð
ð )0×(KÒ(Kð
ð  &-×%EÒ%Eð!
ð" (/×'IÒ'Ið#
ð 	
r6   Úgeneration_configÚprefix_allowed_tokens_fnÚlogits_processorÚstopping_criteriac           	      óV  ^^• Uc  U R                   n[        R                  " U5      nUR                  " S%0 UD6nUR	                  SS5      SLnU R                  X}5        Tb  TOU R                  R                  mU R                  Gb  UGc  U R                  XS9S   nU R                  UUR                  5       R                  S[        R                  S9R                  5       U R                  R                  R                   TSS9nUS	   US
   US   npCUR                  U5      nUR                  U5      nUR                  U5      n[        R"                  " UR%                  S5      UR'                  SS5      5      R)                  S5      nUR*                  S   T-  S:X  d   ST SUR*                  S    S35       eUR*                  S   T-  mU R,                  R                  R/                  5       nU" X4SS9n[        R0                  " TUR2                  -  S4UR4                  [        R6                  [9        U R;                  5       5      R<                  S9nUR*                  S   nUS   nS&UU4S jjnU" XGR2                  S9nU" UUR2                  S9US'   UR?                  UR2                  SS9nX\S'   UUS'   XLS'   TUS'   U RA                  UUUUU	UR<                  S9nU RC                  XzS9nUR2                  S:X  aB  URD                  S:”  a  [G        SURD                   S35      eU RH                  " U4UUUS SS!.UD6$ UR2                  S:”  a=  URD                  UR2                  :”  a  [G        S"5      eU RJ                  " U4UUUS S#.UD6$ [G        S$UR2                   35      e)'a	  
Implements RAG token decoding.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The sequence used as a prompt for the generation. If `input_ids` is not passed, then
        `context_input_ids` has to be provided.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    n_docs (`int`, *optional*, defaults to `config.n_docs`)
        Number of documents to retrieve and/or number of documents for which to generate an answer.
    generation_config (`~generation.GenerationConfig`, *optional*):
        The generation configuration to be used as base parametrization for the generation call. `**kwargs`
        passed to generate matching the attributes of `generation_config` will override them. If
        `generation_config` is not provided, the default will be used, which has the following loading
        priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
        configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
        default values, whose documentation should be checked to parameterize generation.
    prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
        If provided, this function constraints the beam search to allowed tokens only at each step. If not
        provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
        `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
        the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This argument is useful for
        constrained generation conditioned on the prefix, as described in [Autoregressive Entity
        Retrieval](https://arxiv.org/abs/2010.00904).
    logits_processor (`LogitsProcessorList`, *optional*):
        Custom logits processors that complement the default logits processors built from arguments and a
        model's config. If a logit processor is passed that is already created with the arguments or a model's
        config an error is thrown.
    stopping_criteria (`StoppingCriteriaList`, *optional*):
        Custom stopping criteria that complement the default stopping criteria built from arguments and a
        model's config. If a stopping criteria is passed that is already created with the arguments or a
        model's config an error is thrown.
    kwargs (`Dict[str, Any]`, *optional*):
        Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
        forwarded to the `forward` function of the model.

Return:
    `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
    sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches
    finished early due to the `eos_token_id`.
Nr|   ré   r   r‡   rˆ   r‹   rŒ   r    r!   r   r   rH   r“   r”   r•   T)r{   r|   r†   )rŠ   r‰   r’   Úlast_hidden_statec                 óô   >• U S S S S 24   R                  TST4U R                  SS  -   5      n U R                  TUT4U R                  SS  -   5      n U R                  TU-  T-  4U R                  SS  -   5      $ )Nr   r
   )ÚreshaperŸ   Úexpand)Útensorrç   rù   r„   s     €€r7   Úextend_enc_outputÚ9RagTokenForGeneration.generate.<locals>.extend_enc_outputõ  sŒ   ø€ à˜D $ª˜MÑ*×2Ñ2°JÀÀ6Ð3JÈVÏ\É\ÐZ[ÐZ\ÐM]Ñ3]Ó^ˆFà—]‘] J°	¸6Ð#BÀVÇ\Á\ÐRSÐRTÐEUÑ#UÓVˆFà—>‘> :°	Ñ#9¸FÑ#BÐ"DÀvÇ|Á|ÐTUÐTVÐGWÑ"WÓXÐXr6   )rç   r–   r   r}   r„   )rf  Úinput_ids_seq_lengthÚencoder_input_idsrg  rh  r‰   )rf  ri  z)num_return_sequences has to be 1, but is z when doing greedy search.F)rh  ri  rf  Úsynced_gpusÚstreamerzA`num_return_sequences` has to be smaller or equal to `num_beams`.)rh  ri  rf  rt  uH   `num_beams` has to be an integer strictly superior to 0 (â‰¥ 1), but is r+   r˜   )&rf  ÚcopyÚdeepcopyÚupdaterZ   Ú_prepare_special_tokensrK   r„   rB   rP   r™   rš   r1   r›   rœ   rQ   r   r    r¡   r¢   r£   rŸ   r?   Úget_encoderÚfullrç   Údecoder_start_token_idÚlongÚnextÚ
parametersr‰   r¤   Ú_get_logits_processorÚ_get_stopping_criteriaræ   Ú
ValueErrorÚ_sampleÚ_beam_search)rx   r{   r|   r    r!   r   r„   rf  rg  rh  ri  r]   rõ   Úkwargs_has_attention_maskr÷   Úoutr   Úencoderr}   rr  rk  rp  Úpre_processorÚprepared_stopping_criteriarù   s         `                 @r7   rì   ÚRagTokenForGeneration.generatek  sO  ù€ ðb Ñ$Ø $× 6Ñ 6ÐÜ ŸMšMÐ*;Ó<ÐØ(×/Ò/Ñ9°&Ñ9ˆà$0×$4Ñ$4Ð5EÀtÓ$LÐTXÐ$XÐ!Ø×$Ñ$Ð%6ÔRð "Ñ-‘°4·;±;×3EÑ3Eˆð >‰>Ò%Ð*;Ò*CØ%)×%:Ñ%:¸9Ð%:Ð%dÐefÑ%gÐ"Ø—.‘.ØØ&×-Ñ-Ó/×2Ñ2¸%ÄuÇ}Á}Ð2ÐU×[Ñ[Ó]Ø—~‘~×,Ñ,×3Ñ3ØØ#ð !ð ˆCð Ð'Ñ(ØÐ,Ñ-ØÐ*Ñ+ð 8LÐ5ð $8×#:Ñ#:Ð;QÓ#RÐ Ø 1× 4Ñ 4°YÓ ?ÐØ%;×%>Ñ%>¸yÓ%IÐ"ô ŸšÐ#9×#CÑ#CÀAÓ#FÐH\×HfÑHfÐghÐjkÓHlÓm×uÑuØóˆJð "×'Ñ'¨Ñ*¨VÑ3¸Ó9ð 	
Ø[Ð\bÐ[cð dØ!×'Ñ'¨Ñ*Ð+¨1ð.ó	
Ð9ð '×,Ñ,¨QÑ/°6Ñ9ˆ
à—(‘(×$Ñ$×0Ñ0Ó2ˆÙ!Ð,=ÐrvÑwˆä—J’JØÐ+×5Ñ5Ñ5°qÐ9Ø×4Ñ4Ü—*‘*Ü˜Ÿ™Ó)Ó*×1Ñ1ñ	
ˆ	ð  )Ÿ™¨rÑ2ÐØ+Ð,?Ñ@Ð÷	Yð 	Yñ "3Ð3I×UpÑUpÑ!qÐÙ/@ØÐ):×)DÑ)Dñ0
ˆÐ+Ñ,ð  ×1Ñ1Ð2C×2MÑ2MÐSTÐ1ÐUˆ
ð &0\Ñ"Ø*9ˆÐ&Ñ'Ø)?Ð%Ñ&Ø!'ˆXÑà×2Ñ2Ø/Ø!5Ø/Ø%=Ø-Ø×#Ñ#ð 3ð 
ˆð &*×%@Ñ%@Ø/ð &Að &
Ð"ð ×&Ñ&¨!Ó+Ø ×5Ñ5¸Ó9Ü Ø?Ð@Q×@fÑ@fÐ?gð h&ð &óð ð —<’<Øðà!.Ø"<Ø"3Ø!Øñð ñð ð ×(Ñ(¨1Ó,Ø ×5Ñ5Ð8I×8SÑ8SÓSÜ Ð!dÓeÐeØ×$Ò$Øðà!.Ø"<Ø"3Ø!ñð ñð ô ØZÐ[l×[vÑ[vÐZwÐxóð r6   c                 óJ   • U R                   R                  R                  5       $ r˜   )r?   rQ   Úget_input_embeddingsrÛ   s    r7   rŒ  Ú*RagTokenForGeneration.get_input_embeddings7  s   € Øx‰x×!Ñ!×6Ñ6Ó8Ð8r6   c                 óJ   • U R                   R                  R                  5       $ r˜   )r?   rQ   Úget_output_embeddingsrÛ   s    r7   r  Ú+RagTokenForGeneration.get_output_embeddings:  s   € Øx‰x×!Ñ!×7Ñ7Ó9Ð9r6   c                 óL   • U R                   R                  R                  U5      $ r˜   )r?   rQ   Úset_output_embeddings)rx   Únew_embeddingss     r7   r’  Ú+RagTokenForGeneration.set_output_embeddings=  s   € Øx‰x×!Ñ!×7Ñ7¸ÓGÐGr6   c                 óÂ   • Uc  U R                   R                  nUR                  UR                  5      nUSS2SS24   R	                  5       USS2SS24'   X#SS2S4'   U$ )zCShift input ids one token to the right, and pad with start_token_idNr’   r   r   )rK   r|  Ú	new_zerosrŸ   Úclone)rx   r{   Ústart_token_idÚshifted_input_idss       r7   Úshift_tokens_rightÚ(RagTokenForGeneration.shift_tokens_right@  se   € àÑ!Ø!Ÿ[™[×?Ñ?ˆNØ%×/Ñ/°	·±Ó@ÐØ#,ªQ°°°¨VÑ#4×#:Ñ#:Ó#<Ðš!˜Q™R˜%Ñ Ø"0š!˜Q˜$ÑØ Ð r6   c                 óæ  ^ ^• Ub  UOT R                   R                  n[        R                  " TS S 2SS 24   TR	                  TR
                  S   S5      R                  T R                   R                  R                  5      /S5      mU U4S jnT R                  XU5      nTR                  S5      mTR                  5       UR                  5       :X  d   eUR                  STS9n	UR                  SSS9n
U" Xš5      u  pšU	R                  S5      n	U
R                  S5      n
U	* nU
* nU(       a   UR                  5       nUR                  5       nXXR                  S5      -  nSU-
  U-  XÜ-  -   nU$ )	Nr   r   c                 ó  >• TR                  TR                  R                  R                  5      nUR	                  5       (       a$  U R                  US5        UR                  US5        U R                  S5      UR                  S5      4$ r  r	  r  s      €€r7   r  Ú1RagTokenForGeneration.get_nll.<locals>._mask_padsP  r  r6   r’   r  Tr  r  )rK   r„   r1   r  r  rŸ   r  rQ   rê   rb  r¡   r—   r!  r"  r   )rx   r$  r   r  rÑ   rÕ   r„   r  r+  r  r  r,  r-  r.  r   s   `  `           r7   rÖ   ÚRagTokenForGeneration.get_nllI  sW  ù€ Ø!Ñ-‘°4·;±;×3EÑ3Eˆä—’Ø’Aq‘rE‰]˜FŸJ™J v§|¡|°A¡¸Ó:×@Ñ@ÀÇÁ×AVÑAV×AcÑAcÓdÐeÐghó
ˆö	:ð ×'Ñ'¨
ÀÓGˆà×!Ñ! "Ó%ˆØz‰z‹|˜|×/Ñ/Ó1Ó1Ð1Ð1à× Ñ  R¨vÐ Ð6ˆØ!×%Ñ%¨"°dÐ%Ð;ˆ
Ù# BÓ3‰ˆØV‰VA‹YˆØ—^‘^ AÓ&ˆ
à3ˆØ!kˆæØ—|‘|“~ˆHØ%Ÿ/™/Ó+ˆKà×+Ñ+¨BÓ/Ñ/ˆØg‘ Ñ)¨EÑ,?Ñ?ˆØˆr6   r7  rµ   )NNNNNNr˜   r8  )Fr  N),r,   r-   r.   r/   r   r   r   r   rs   rÆ   rÍ   rF  r9  rB   rQ   rP   r;  r^  rb  r   r1   r4   r2   r   r¶   r·   r¸   r¹   r   r³   r:  r   r   r   r   r   rì   rŒ  r  r’  rš  rÖ   r5   rº   r»   s   @r7   r=  r=  K  sÔ  ø† ð .2Ø6:Ø/3Ø,0ñxàÐ)Ñ*ðxð # ?Ñ3ðxð ˜OÑ,ð	xð
 ˜LÑ)÷xð xð<' |ô 'ð+¸Oô +ð ØØØØØô
ð: ñ"ó ð"ð ñ"ó ð"ð ñ)ó ð)ð ñó ðô&	4ð ð 15Ø6:Ø@DØ8<Ø=AØ@DØ8<Ø=AØ26Ø$(Ø,0Ø/3Ø+/Ø)-Ø&*Ø-1Ø $ñ%j
à˜E×,Ñ,Ñ-ðj
ð ! ×!2Ñ!2Ñ3ðj
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ð	j
ð
 $ E×$4Ñ$4Ñ5ðj
ð !)¨×)9Ñ)9Ñ :ðj
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ðj
ð $ E×$4Ñ$4Ñ5ðj
ð !)¨×)9Ñ)9Ñ :ðj
ð ˜U×.Ñ.Ñ/ðj
ð ˜D‘>ðj
ð $ D™>ðj
ð ' t™nðj
ð # 4™.ðj
ð ! ™ðj
ð  ˜d‘^ð!j
ð" ˜×)Ñ)Ñ*ð#j
ð$ ˜‘ð%j
ð( 
"ô)j
ó ðj
ðX ‡]‚]ƒ_ð 15Ø59Ø8<Ø=AØ26Ø $Ø8<ØW[Ù:MÓ:OÙ<PÓ<RñIà˜E×,Ñ,Ñ-ðIð ! ×!1Ñ!1Ñ2ðIð $ E×$4Ñ$4Ñ5ð	Ið
 !)¨×)9Ñ)9Ñ :ðIð ˜U×.Ñ.Ñ/ðIð ˜‘ðIð $Ð$4Ñ5ðIð #+¨8°S¸%¿,¹,Ð4GÈÈcÉÐ4RÑ+SÑ"TðIð #Ð#6Ñ7ðIð $Ð$8Ñ9ðIð 
×	Ñ	ôIó ðIòV9ò:òHô!÷"ò "r6   r=  )rn   r=   r½   r=  )(r0   rv  Údataclassesr   Útypingr   r   r   r   r   r1   r	   Úconfiguration_utilsr   Ú
generationr   r   r   r   Úmodeling_outputsr   Úmodeling_utilsr   Úutilsr   r   Úconfiguration_ragr   Úretrieval_ragr   Ú
get_loggerr,   Úloggerr   r9   r=   rn   r½   r=  Ú__all__r+   r6   r7   Ú<module>r¬     s4  ðñ  ã Ý !ß 9Õ 9ã Ý å 3ß fÓ fÝ +Ý -ß ,Ý (Ý 'ð 
×	Ò	˜HÓ	%€ð ô[O˜{ó [Oó ð[Oð| ôVO˜ó VOó ðVOñr ðñôQo˜ó QoóðQoðh ôX
Ð!ó X
ó ðX
ñv ðñô
mÐ1ó móð
mñ` ðñô
[Ð.°ó [óð
[ò| br6   