o
    Zh][                    @   sN  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ eeZ eG dd deZ!eG dd deZ"eddG dd deZ#eG dd de#Z$eddG dd de#Z%eddG dd de#eZ&g dZ'dS ) zRAG model implementation.    N)	dataclass)CallableListOptionalTupleUnion)nn   )PretrainedConfig)GenerationConfigGenerationMixinLogitsProcessorListStoppingCriteriaList)ModelOutput)PreTrainedModel)auto_docstringlogging   )	RagConfig)RagRetrieverc                   @   s  e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeej ed< dZeej ed< dZeej ed	< dZeej ed
< dZeej ed< dZeeejdf  ed< dZeeejdf  ed< dZeej ed< dZeeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )RetrievAugLMMarginOutputa  
    Base class for retriever augmented marginalized models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
            each vocabulary token.
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
            (see `past_key_values` input) to speed up sequential decoding.
        retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
            Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
            the `doc_scores`.
        retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
            The indexes of the embedded documents retrieved by the retriever.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever.
        question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
            model.
        question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
        question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
        generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
        generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
        generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    Nlosslogits
doc_scorespast_key_valuesretrieved_doc_embedsretrieved_doc_idscontext_input_idscontext_attention_mask"question_encoder_last_hidden_state.question_enc_hidden_statesquestion_enc_attentionsgenerator_enc_last_hidden_stategenerator_enc_hidden_statesgenerator_enc_attentionsgenerator_dec_hidden_statesgenerator_dec_attentionsgenerator_cross_attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   r   
LongTensorr   r   r   r    r   r!   r"   r#   r$   r%   r&   r'    r0   r0   S/var/www/auris/lib/python3.10/site-packages/transformers/models/rag/modeling_rag.pyr   $   s&   
 Jr   c                   @   sn  e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed	< dZeej ed
< dZeeejdf  ed< dZeeejdf  ed< dZeej ed< dZeeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )RetrievAugLMOutputa;  
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
            each vocabulary token.
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
            (see `past_key_values` input) to speed up sequential decoding.
        retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
            Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
            the `doc_scores`.
        retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
            The indexes of the embedded documents retrieved by the retriever.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever.
        question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
            model.
        question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
        question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
        generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
        generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
        generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    Nr   r   r   r   r   r   r   r   .r    r!   r"   r#   r$   r%   r&   r'   )r(   r)   r*   r+   r   r   r,   r-   r.   r   r   r   r   r   r/   r   r   r   r    r   r!   r"   r#   r$   r%   r&   r'   r0   r0   r0   r1   r2      s$   
 Fr2   a  
    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
    Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.

    RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
    generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
    )Zcustom_introc                
   @   sJ   e Zd ZeZdZdZdZe			d
de	e
 de	e
 dedefdd	ZdS )RagPreTrainedModelragTN.question_encoder_pretrained_model_name_or_path'generator_pretrained_model_name_or_path	retrieverreturnc                 K   s  dd |  D }dd |  D }| D ]}|d| = q| D ]}|d| = q"|dd}|du rh|dus<J dd	d
lm}	 d|vr_d	dlm}
 |
j|fi |ddi\}}||d< |	j|fi |}|dd}|du r|duszJ dd	dlm} d|vrd	dlm}
 |
j|fi |ddi\}}||d< |j|fi |}|	dd}|du rt
j|j|jfi |}| ||||dS )a  
        Instantiates an question encoder and a generator from one or two base classes of the library from pretrained
        model checkpoints.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you need to first set it back in training mode with `model.train()`.

        Params:
            question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the question encoder. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the generator. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args (remaining positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            retriever ([`RagRetriever`], *optional*):
                The retriever to use.
            kwwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                `output_attentions=True`).

                - To update the question_encoder configuration, use the prefix *question_encoder_* for each
                  configuration parameter.
                - To update the generator configuration, use the prefix *generator_* for each configuration parameter.
                - To update the parent model configuration, do not use a prefix for each configuration parameter.

                Behaves differently depending on whether a `config` is provided or automatically loaded.

        Example:

        ```python
        >>> from transformers import RagModel

        >>> # initialize a RAG from two pretrained models.
        >>> model = RagModel.from_pretrained_question_encoder_generator(
        ...     "facebook/dpr-question_encoder-single-nq-base", "google-t5/t5-small"
        ... )
        >>> # saving model after fine-tuning
        >>> model.save_pretrained("./rag")
        >>> # load fine-tuned model
        >>> model = RagModel.from_pretrained("./rag")
        ```c                 S   ,   i | ]\}}| d r|td d |qS )question_encoder_N
startswithlen.0argumentvaluer0   r0   r1   
<dictcomp>0      zQRagPreTrainedModel.from_pretrained_question_encoder_generator.<locals>.<dictcomp>c                 S   r9   )
generator_Nr;   r>   r0   r0   r1   rB   6  rC   r:   rD   modelNznIf `model` is not defined as an argument, a `question_encoder_pretrained_model_name_or_path` has to be defined   	AutoModelconfig)
AutoConfigZreturn_unused_kwargsTzqIf `generator_model` is not defined as an argument, a `generator_pretrained_model_name_or_path` has to be definedAutoModelForSeq2SeqLM)question_encoder	generatorrI   r7   )itemskeyspopauto.modeling_autorH   Zauto.configuration_autorJ   Zfrom_pretrainedrL   getr   'from_question_encoder_generator_configsrI   )clsr5   r6   r7   kwargsZkwargs_question_encoderZkwargs_generatorkeyrM   rH   rJ   Zquestion_encoder_configrN   rL   Zgenerator_configrI   r0   r0   r1   *from_pretrained_question_encoder_generator   sx   D

z=RagPreTrainedModel.from_pretrained_question_encoder_generator)NNN)r(   r)   r*   r   config_classZbase_model_prefixZ_supports_flash_attn_2Z_supports_sdpaclassmethodr   strr   r   rX   r0   r0   r0   r1   r3      s$    
r3   c                "       s  e Zd Z				ddee dee dee dee f fddZe														ddee	j
 d	ee	j d
eeee	j   dee	j
 dee	j deeee	j   dee	j dee	j
 dee	j
 dee dee dee dee dee deee	j ef fddZ  ZS )RagModelNrI   rM   rN   r7   c                    s  |dus|dur|dusJ d|du r!t j|j|jfi |}nt|| js2J d| d| j t | |du rHddlm} |	|j
}|du rXddlm} |	|j}|| _| jdurst|tspJ dt| j d	|| _|| _
|| _d| _d
| _dS )  
        question_encoder (`PreTrainedModel`, *optional*):
            The model responsible for encoding the question into hidden states for retrieval.
        generator (`PreTrainedModel`, *optional*):
            The model responsible for generating text based on retrieved documents.
        retriever (`RagRetriever`, *optional*):
            The component responsible for retrieving documents from a knowledge base given the encoded question.
        NzQEither a configuration or an question_encoder and a generator has to be provided.zconfig: z has to be of type rF   rG   rK   z`self.retriever` is of type z&, but should be of type `RagRetriever`F)r   rT   rI   
isinstancerY   super__init__rR   rH   Zfrom_configrM   rL   rN   r7   r   typectx_encodercontext_encoder_training)selfrI   rM   rN   r7   rV   rH   rL   	__class__r0   r1   r`   |  s6   "

zRagModel.__init__	input_idsattention_maskencoder_outputsdecoder_input_idsdecoder_attention_maskr   r   r   r   	use_cacheoutput_attentionsoutput_hidden_statesoutput_retrievedn_docsr8   c                 C   sr  |dur|n| j j}|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur.|n| j j}| jduoF|du pB|	du pB|du oF|du }|du r|r| j||dd}|d }| j|| j	dt
jd | jj j|dd}| jr|d	 |d
 |d |d |d |d f\}}	}}}}|	|}|		|}	|	|}|	|}| j||ddj}|d||jd }t
|d|ddd}nM|d	 |d
 |d |d f\}}	}}|	|}|	|}|		|}	t
|d|ddd}n|dusJ d|	dus
J d|dusJ d|dusJ d|jd | dks4J d| d|jd  d|dur@|j|dd}|durL|j|dd}| j||	|||||
|dd	}|shd}d}d}d}d}n|j}|j}|rt|s|d}d}	d}d}td)i d|jd|d|jd	|d
|	d|d|d |d!|d"|d#|jd$|jd%|j d&|j!d'|j"d(|j#S )*ay  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
            which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
            obtain the indices.

            [What are input IDs?](../glossary#input-ids)
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
            Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
            *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
            sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
            generator's encoder.

            Used by the ([`RagModel`]) model during decoding.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
            you're using with your RAG instance.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
            has to be provided to the forward pass. `doc_scores` can be computed via
            `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
            the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
            provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
        output_retrieved (`bool`, *optional*):
            Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask`. See returned tensors for more detail.
        n_docs (`int`, *optional*):
            The number of documents to retrieve.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> outputs = model(input_ids=inputs["input_ids"])
        ```NT)rh   return_dictr   cpudevicedtypeptprefixrp   Zreturn_tensorsr   r   r   Ztokenized_doc_idsZtokenized_doc_attention_maskZdoc_idsr   rF   zMake sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z^Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function.M The first dimension of `context_input_ids` should be a multiple of `n_docs`=	, but is .dim)	rg   rh   ri   rj   rk   r   rl   rm   rq   Nr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r0   )$rI   rp   rl   rm   rn   ro   r7   rM   detachtor,   float32numpyrN   rx   rc   rb   Zpooler_outputviewshapebmm	unsqueeze	transposesqueezerepeat_interleavehidden_statesZ
attentionsr2   r   r   Zencoder_last_hidden_stateZencoder_hidden_statesZencoder_attentionsZdecoder_hidden_statesZdecoder_attentionsZcross_attentions)rd   rg   rh   ri   rj   rk   r   r   r   r   rl   rm   rn   ro   rp   Zhas_to_retrieveZquestion_enc_outputsr   Zretriever_outputsr   Zretrieved_doc_input_idsZretrieved_doc_attention_maskr   Zgen_outputsr    r!   r0   r0   r1   forward  s&  I

	









	
zRagModel.forwardNNNN)NNNNNNNNNNNNNN)r(   r)   r*   r   r
   r   r   r`   r   r,   r/   Tensorr   r-   
BoolTensorboolintr   r2   r   __classcell__r0   r0   re   r1   r\   z  sx    2	
r\   zu
    A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
    c                &       s   e Zd Z				d2dee dee dee dee f fddZdefdd	Zd
efddZ	e
																	d3deej deej deeeej   deej deej deeeej   deej deej deej dee dee dee dee dee dee deej dee def$dd Zed!d" Zed#d$ Zed%d& Ze 									d4deej deej deej deej deej d'ee d(ee d)ee dee dejfd*d+Z	d5d.d/Zed0d1 Z  ZS )6RagSequenceForGenerationNrI   rM   rN   r7   c                    b   |dus|dur|dusJ d|du r t j|j|jfi |}t | t||||d| _dS r]   NzHEither a configuration or an encoder and a generator has to be provided.)rI   rM   rN   r7   r   rT   rI   r_   r`   r\   r4   rd   rI   rM   rN   r7   rV   re   r0   r1   r`     s   z!RagSequenceForGeneration.__init__c                 C      || j _d S r   r4   r7   rd   r7   r0   r0   r1   set_retriever     z&RagSequenceForGeneration.set_retrieverrb   c                 C      d| j _|| j _d S NTr4   rc   rb   rd   rb   r0   r0   r1    set_context_encoder_for_training     z9RagSequenceForGeneration.set_context_encoder_for_trainingrg   rh   ri   rj   rk   r   r   r   r   rl   rm   rn   ro   exclude_bos_scorereduce_losslabelsrp   r8   c                 K   s6  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|du r(|}d}
| j||||||||	||
||||d}d}|durS| j|j|j||| j j||d}t	di d|d|jd|jd|j
d	|jd
|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jS )a3  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
            which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
            obtain the indices.

            [What are input IDs?](../glossary#input-ids)
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
            Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
            *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
            sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
            generator's encoder.

            Used by the ([`RagModel`]) model during decoding.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
            you're using with your RAG instance.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
            the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
            provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
            has to be provided to the forward pass. `doc_scores` can be computed via
            `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
        output_retrieved (`bool`, *optional*):
            Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask`. See returned tensors for more detail.
        exclude_bos_score (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when computing
            the loss.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
            operation.
        n_docs (`int`, *optional*):
            The number of documents to retrieve.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-sequence-nq")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
        >>> input_ids = inputs["input_ids"]
        >>> labels = targets["input_ids"]
        >>> outputs = model(input_ids=input_ids, labels=labels)

        >>> # or use retriever separately
        >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
        >>> # 1. Encode
        >>> question_hidden_states = model.question_encoder(input_ids)[0]
        >>> # 2. Retrieve
        >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
        >>> doc_scores = torch.bmm(
        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
        ... ).squeeze(1)
        >>> # 3. Forward to generator
        >>> outputs = model(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ...     decoder_input_ids=labels,
        ... )
        ```NFrg   rh   ri   rj   rk   r   r   r   r   rl   rm   rn   ro   rp   )r   epsilonr   rp   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r0   )rI   rp   r   r   r4   get_nllr   r   label_smoothingr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   )rd   rg   rh   ri   rj   rk   r   r   r   r   rl   rm   rn   ro   r   r   r   rp   rV   outputsr   r0   r0   r1   r     s   g
	
z RagSequenceForGeneration.forwardc                 C      | j jS r   r   rd   r0   r0   r1   r7   a     z"RagSequenceForGeneration.retrieverc                 C   r   r   r4   rN   r   r0   r0   r1   rN   e  r   z"RagSequenceForGeneration.generatorc                 C   r   r   r4   rM   r   r0   r0   r1   rM   i  r   z)RagSequenceForGeneration.question_encoderdo_deduplicationnum_return_sequences	num_beamsc
                 K   sT  |	dur|	n| j j}	|dur|n| j j}|dur|n| j j}|dur$|n| j j}|dus4|dus4J d| jdurd|du rd| j||dd }| j|| jdt	j
d | jj j|	ddd	 }||}g }||
d
< ||
d< d|
d< |dur{|jd n|jd |	 }t|D ]}|||	 |d |	  }| jj|fi |
}|rt	tdd |D  }|jd }|dur|||d  |d}| ||dd}nC|dusJ d|dusJ d||d}|||	 |d |	  }||d}|||d ddf }||d}| ||||dd}|d  |d }|||  q| j|| j jjdS )a  
        Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation
        for more information on how to set other generate input parameters.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation. If `input_ids` is not passed, then
                `context_input_ids` has to be provided.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
                retriever.
            context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `context_input_ids` and
                `context_attention_mask` have to be provided to the forward pass. They are returned by
                [`~RagRetriever.__call__`].
            doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
                Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
                `question_encoder_last_hidden_state`.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores` has to be
                provided to the forward pass. `doc_scores` are returned by [`~RagRetriever.__call__`].
            do_deduplication (`bool`, *optional*):
                Whether or not to deduplicate the generations from different context documents for a given input. Has
                to be set to `False` if used while training with distributed backend.
            num_return_sequences(`int`, *optional*, defaults to 1):
                The number of independently computed returned sequences for each element in the batch. Note that this
                is not the value we pass to the `generator`'s `[`~generation.GenerationMixin.generate`]` function,
                where we set `num_return_sequences` to `num_beams`.
            num_beams (`int`, *optional*, defaults to 1):
                Number of beams for beam search. 1 means no beam search.
            n_docs (`int`, *optional*, defaults to `config.n_docs`)
                Number of documents to retrieve and/or number of documents for which to generate an answer.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional kwargs will be passed to [`~generation.GenerationMixin.generate`].

        Return:
            `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
            sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all batches
            finished early due to the `eos_token_id`.
        Nz= At least one of input_ids or context_input_ids must be givenrh   r   rr   rs   rv   rw   r   r   r   rh   r   c                 S   s   i | ]	}t | |qS r0   )r[   tolist)r?   kr0   r0   r1   rB     s    z5RagSequenceForGeneration.generate.<locals>.<dictcomp>T)r   r   zMake sure that `context_attention_mask` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.)r   r   r   r   r   r   )pad_token_id)rI   rp   r   r   r   r7   rM   r   r   r,   r   r   rN   rx   r   rangegeneratestacklistvaluesrepeatZtopkappend_cat_and_padr   )rd   rg   rh   r   r   r   r   r   r   rp   model_kwargsZnum_doc_return_sequencesquestion_hidden_statesZhypos
batch_sizeindexZgenerator_input_idsZoutput_sequencesZnum_candidatesZnew_input_idsr   Zindividual_input_idsZindividual_attention_maskZindividual_doc_scoresZtop_cand_indsr0   r0   r1   r   m  s~   A
	 

z!RagSequenceForGeneration.generateF        c                    sB  t d d dd f jd d jjjgd|d ur#|n jj} jj	p/ jjj	}|d uo@d d df 
| }	 fdd}
tjj|dd|jd | |d|d}tjj|dddd}|d d d d d dd d f }|d d d d ddd d f }|d d d d dd d d f }t j||| |gdd}ddd|dd | ksJ |jdd}|jdd	d
}|
||\}}|r|	r|d d d d dd f dn|d}|d}|d}|d}| }| }|r| }| }||d }d| | ||  }|S )Nr   r   c                    D     jjj}| r| |d ||d | d|dfS Nr   ry   eqrI   rN   r   anyZmasked_fill_r   ll
smooth_objZpad_maskrd   targetr0   r1   
_mask_pads  
   z4RagSequenceForGeneration.get_nll.<locals>._mask_padsry   r}   rF   r~   r   Tr~   Zkeepdim      ?)r,   catnewr   fill_rI   rN   r   rp   bos_token_idr   allr   
functionallog_softmaxr   sizer   r   r~   gathersum	logsumexp)rd   
seq_logitsr   r   r   r   r   rp   r   Zuse_bosr   seq_logprobsdoc_logprobsZfirst_token_scoresZsecond_token_scores	remainderrag_logprobsr   r   nll_losssmooth_losseps_ir   r0   r   r1   r     s@   2"   2


z RagSequenceForGeneration.get_nllc                 C   sv   | d  tdd | D tdd | D |}d}| D ]}|||||jd  d |jd f< ||jd 7 }q|S )Nr   c                 S      g | ]}|j d  qS )r   r   r?   tr0   r0   r1   
<listcomp>B      z9RagSequenceForGeneration._cat_and_pad.<locals>.<listcomp>c                 S   r   )r   r   r   r0   r0   r1   r   B  r   r   )r   r   maxr   r   )Ztensorsr   outputindr   r0   r0   r1   r   ?  s   0$z%RagSequenceForGeneration._cat_and_padr   NNNNNNNNNNNNNNNNN)	NNNNNNNNN)Fr   FN)r(   r)   r*   r   r
   r   r   r`   r   r   r   r,   r/   r   r   r   r-   r   r   r   r   propertyr7   rN   rM   no_gradr   r   staticmethodr   r   r0   r0   re   r1   r     s    	
 !


	
 
;r   zo
    A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
    c                &       s`  e Zd Z				d?dee dee dee dee f fddZdefdd	Zd
efddZ							d@ddZ
edd Zedd Zedd Zedd ZdAddZe																	dBdeej deej deeeej   deej deej deeeej   deej d eej d!eej d"ee d#ee d$ee d%ee d&ee d'ee d(eej d)ee d*ef$d+d,Ze dddddddde e f
deej deej deej d eej d!eej d)ee d-ee  d.ee!eejge"e f  d/ee d0ee d*ejfd1d2Z#d3d4 Z$d5d6 Z%d7d8 Z&dAd9d:Z'dCd=d>Z(  Z)S )DRagTokenForGenerationNrI   rM   rN   r7   c                    r   r   r   r   re   r0   r1   r`   Q  s   zRagTokenForGeneration.__init__c                 C   r   r   r   r   r0   r0   r1   r   o  r   z#RagTokenForGeneration.set_retrieverrb   c                 C   r   r   r   r   r0   r0   r1   r   r  r   z6RagTokenForGeneration.set_context_encoder_for_trainingc           	   
   K   s4   |d ur|d d dd f }d ||||||d|d	S )Nry   T)	rg   ri   r   r   rj   r   rl   do_marginalizerp   r0   )	rd   rj   r   rh   rl   ri   r   rp   rV   r0   r0   r1   prepare_inputs_for_generationv  s   z3RagTokenForGeneration.prepare_inputs_for_generationc                 C   r   r   r   r   r0   r0   r1   r7     r   zRagTokenForGeneration.retrieverc                 C   r   r   r   r   r0   r0   r1   rN     r   zRagTokenForGeneration.generatorc                 C   r   r   r   r   r0   r0   r1   rM     r   z&RagTokenForGeneration.question_encoderc                    s8   dd  d}| D ]}|t  fdd|D f7 }q|S )zeReorders cache for generation. BART-inspired but we need to take care of the extra dimension for docsc                 S   s^   | j d |j d  }| jd|g| j dd  R  } | d|} | jdg| j dd  R  }|S )Nr   ry   r   rF   )r   r   Zindex_select)r   Z	new_orderrp   resultr0   r0   r1   _reorder_stacked  s
   z>RagTokenForGeneration._reorder_cache.<locals>._reorder_stackedr0   c                 3   s"    | ]} | |jV  qd S r   )r   rt   )r?   Z
past_stater   beam_idxr0   r1   	<genexpr>  s     z7RagTokenForGeneration._reorder_cache.<locals>.<genexpr>)tuple)r   r   Zreordered_pastZ
layer_pastr0   r   r1   _reorder_cache  s   z$RagTokenForGeneration._reorder_cachec                 C   sp   |d ur|n| j j}tjj|dd|jd | |d|d}tj|dd}||	d	d }tj
|ddS )Nry   r}   r   r   )rI   rp   r   r   r   r   r   r   r,   r   r   )rd   r   r   rp   r   r   Zlog_prob_sumr0   r0   r1   marginalize  s   z!RagTokenForGeneration.marginalizerg   rh   ri   rj   rk   r   r   r   r   rl   rm   rn   ro   r   r   r   rp   r8   c                 K   sX  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|du r(|}d}
| j||||||||	||
||||d}d}|j}|dur[|dusLJ | j|j|j||| j j|d}|re| 	||j|}t
di d|d|d|jd|jd	|jd
|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
            which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
            obtain the indices.

            [What are input IDs?](../glossary#input-ids)
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
            Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
            *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
            sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
            generator's encoder.

            Used by the ([`RagModel`]) model during decoding.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
            you're using with your RAG instance.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
            the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
            provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
            has to be provided to the forward pass. `doc_scores` can be computed via
            `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
        output_retrieved (`bool`, *optional*):
            Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask`. See returned tensors for more detail.
        do_marginalize (`bool`, *optional*):
            If `True`, the logits are marginalized over all documents by making use of
            `torch.nn.functional.log_softmax`.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
            operation.
        n_docs (`int`, *optional*):
            The number of documents to retrieve.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
        >>> input_ids = inputs["input_ids"]
        >>> labels = targets["input_ids"]
        >>> outputs = model(input_ids=input_ids, labels=labels)

        >>> # or use retriever separately
        >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
        >>> # 1. Encode
        >>> question_hidden_states = model.question_encoder(input_ids)[0]
        >>> # 2. Retrieve
        >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
        >>> doc_scores = torch.bmm(
        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
        ... ).squeeze(1)
        >>> # 3. Forward to generator
        >>> outputs = model(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ...     decoder_input_ids=labels,
        ... )

        >>> # or directly generate
        >>> generated = model.generate(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ... )
        >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
        ```NFr   )r   r   rp   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r0   )rI   rp   r   r   r4   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   )rd   rg   rh   ri   rj   rk   r   r   r   r   rl   rm   rn   ro   r   r   r   rp   rV   r   r   r   r0   r0   r1   r     s   o		
zRagTokenForGeneration.forwardgeneration_configprefix_allowed_tokens_fnlogits_processorstopping_criteriac                    s  |du r| j }t|}|jd&i |}|dddu}| || dur(n| jj| jdur|du r| j	||dd }| j||
 jdtjd | jjjdd}|d	 |d
 |d }}}||}||}||}t|d|ddd}|jd  dksJ d d|jd  d|jd   | jj }|||dd}tj |j df|jtjt|  jd}|jd }|d }d' fdd	}|||jd}|||jd|d< |j|jdd}||d< ||d< ||d< |d< | j |||||	|jd}| j!||
d}|jdkr2|j"dkr#t#d|j" d | j$|f|||d!dd"|S |jdkrQ|j"|jkrCt#d#| j%|f|||d!d$|S t#d%|j )(a  
        Implements RAG token decoding.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation. If `input_ids` is not passed, then
                `context_input_ids` has to be provided.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
                Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
                `question_encoder_last_hidden_state`.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            n_docs (`int`, *optional*, defaults to `config.n_docs`)
                Number of documents to retrieve and/or number of documents for which to generate an answer.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
                `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
                the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This argument is useful for
                constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://arxiv.org/abs/2010.00904).
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and a
                model's config. If a logit processor is passed that is already created with the arguments or a model's
                config an error is thrown.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complement the default stopping criteria built from arguments and a
                model's config. If a stopping criteria is passed that is already created with the arguments or a
                model's config an error is thrown.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model.

        Return:
            `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
            sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches
            finished early due to the `eos_token_id`.
        Nrh   r   r   rr   rs   rv   rw   r   r   r   r   rF   rz   r{   r|   T)rg   rh   rq   )ru   rt   ry   last_hidden_statec                    sl   | d d d d f   df| jdd   } |  |f| jdd   } |   |  f| jdd   S )Nr   r	   )Zreshaper   expand)Ztensorr   r   rp   r0   r1   extend_enc_output  s   ,"z9RagTokenForGeneration.generate.<locals>.extend_enc_output)r   r}   r   ri   rp   )r   input_ids_seq_lengthZencoder_input_idsr   r   rt   )r   r  z)num_return_sequences has to be 1, but is z when doing greedy search.F)r   r  r   synced_gpusstreamerzA`num_return_sequences` has to be smaller or equal to `num_beams`.)r   r  r   r  uH   `num_beams` has to be an integer strictly superior to 0 (≥ 1), but is r0   r   )&r   copydeepcopyupdaterS   Z_prepare_special_tokensrI   rp   r7   rM   r   r   r,   r   r   rN   rx   r   r   r   r   r   r4   Zget_encoderfullr   decoder_start_token_idlongnext
parametersrt   r   Z_get_logits_processorZ_get_stopping_criteriar   
ValueErrorZ_sampleZ_beam_search)rd   rg   rh   r   r   r   rp   r   r   r   r  rV   r   Zkwargs_has_attention_maskr   outr   encoderri   r  r  r  Zpre_processorZprepared_stopping_criteriar0   r  r1   r   k  s   Q





	
			
zRagTokenForGeneration.generatec                 C      | j j S r   )r4   rN   get_input_embeddingsr   r0   r0   r1   r  7  r   z*RagTokenForGeneration.get_input_embeddingsc                 C   r  r   )r4   rN   get_output_embeddingsr   r0   r0   r1   r  :  r   z+RagTokenForGeneration.get_output_embeddingsc                 C   s   | j j|S r   )r4   rN   set_output_embeddings)rd   Znew_embeddingsr0   r0   r1   r  =  s   z+RagTokenForGeneration.set_output_embeddingsc                 C   sX   |du r| j j}||j}|ddddf  |ddddf< ||dddf< |S )zCShift input ids one token to the right, and pad with start_token_idNry   r   r   )rI   r  Z	new_zerosr   clone)rd   rg   Zstart_token_idZshifted_input_idsr0   r0   r1   shift_tokens_right@  s   (z(RagTokenForGeneration.shift_tokens_rightFr   c                    s  |d ur|n j j}td d dd f jd d j jjgd fdd} 	|||}
d | ksDJ |jdd}	|jddd}
||	|
\}	}
|	d}	|
d}
|	 }|
 }|rs| }| }||d }d	| | ||  }|S )
Nr   r   c                    r   r   r   r   r   r0   r1   r   P  r   z1RagTokenForGeneration.get_nll.<locals>._mask_padsry   r   Tr   r   )rI   rp   r,   r   r   r   r   rN   r   r   r   r~   r   r   r   )rd   r   r   r   r   r   rp   r   r   r   r   r   r   r   r   r0   r   r1   r   I  s*   2


zRagTokenForGeneration.get_nllr   )NNNNNNr   r   )Fr   N)*r(   r)   r*   r   r
   r   r   r`   r   r   r   r   r7   rN   rM   r   r   r   r   r,   r/   r-   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r   r0   r0   re   r1   r   K  s    





	
 -	
 L
	r   )r\   r3   r   r   )(r+   r	  dataclassesr   typingr   r   r   r   r   r,   r   Zconfiguration_utilsr
   Z
generationr   r   r   r   Zmodeling_outputsr   Zmodeling_utilsr   utilsr   r   Zconfiguration_ragr   Zretrieval_ragr   Z
get_loggerr(   loggerr   r2   r3   r\   r   r   __all__r0   r0   r0   r1   <module>   sV   
^Y	      3    "