
    fTh][                       S r SSKrSSKJr  SSKJrJrJrJrJ	r	  SSK
r
SSK
Jr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  \R<                  " \5      r \ " S S\5      5       r!\ " S S\5      5       r"\" SS9 " S S\5      5       r#\ " S S\#5      5       r$\" SS9 " S S\#5      5       r%\" SS9 " S S\#\5      5       r&/ SQr'g) zRAG model implementation.    N)	dataclass)CallableListOptionalTupleUnion)nn   )PretrainedConfig)GenerationConfigGenerationMixinLogitsProcessorListStoppingCriteriaList)ModelOutput)PreTrainedModel)auto_docstringlogging   )	RagConfig)RagRetrieverc                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                      \	S	'   Sr\\R                      \	S
'   Sr\\R                      \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Srg)RetrievAugLMMarginOutput$   a  
Base class for retriever augmented marginalized models outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
        each vocabulary token.
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).

        Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
        (see `past_key_values` input) to speed up sequential decoding.
    retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
        Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
        the `doc_scores`.
    retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
        The indexes of the embedded documents retrieved by the retriever.
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.
    question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
        model.
    question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
    question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
    generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
    generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
    generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
Nlosslogits
doc_scorespast_key_valuesretrieved_doc_embedsretrieved_doc_idscontext_input_idscontext_attention_mask"question_encoder_last_hidden_state.question_enc_hidden_statesquestion_enc_attentionsgenerator_enc_last_hidden_stategenerator_enc_hidden_statesgenerator_enc_attentionsgenerator_dec_hidden_statesgenerator_dec_attentionsgenerator_cross_attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   r   
LongTensorr    r!   r"   r#   r   r$   r%   r&   r'   r(   r)   r*   __static_attributes__r+       \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/rag/modeling_rag.pyr   r   $   s   HT )-D(5$$
%,*.FHU&&'..2J**+29=OXd5#4#456=8<(5#4#45<48x 0 01848x 0 0189=HU%5%56=FJ&1B1B(CJJNu/@/@#/E)F GNGKXeE,=,=s,B&CDKCG#Xe.?.?%@GKO%0A0A30F*G!HOHLhuU->->-C'DELKO%0A0A30F*G!HOHLhuU->->-C'DELJNu/@/@#/E)F GNr6   r   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   Sr\\R                     \	S
'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Srg)RetrievAugLMOutput   aK  
Args:
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
        each vocabulary token.
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_heads, sequence_length, embed_size_per_head)`).

        Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
        (see `past_key_values` input) to speed up sequential decoding.
    retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
        Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
        the `doc_scores`.
    retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
        The indexes of the embedded documents retrieved by the retriever.
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.
    question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
        model.
    question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
    question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
    generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
    generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
    generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
Nr   r   r   r   r   r    r!   r"   .r#   r$   r%   r&   r'   r(   r)   r*   r+   )r,   r-   r.   r/   r0   r   r   r1   r2   r3   r   r   r   r   r   r4   r    r!   r"   r#   r   r$   r%   r&   r'   r(   r)   r*   r5   r+   r6   r7   r9   r9      s   DL +/FHU&&'..2J**+29=OXd5#4#456=8<(5#4#45<48x 0 01848x 0 0189=HU%5%56=FJ&1B1B(CJJNu/@/@#/E)F GNGKXeE,=,=s,B&CDKCG#Xe.?.?%@GKO%0A0A30F*G!HOHLhuU->->-C'DELKO%0A0A30F*G!HOHLhuU->->-C'DELJNu/@/@#/E)F GNr6   r9   a  
    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
    Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.

    RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
    generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
    )custom_introc            
       ^    \ rS rSr\rSrSrSr\	   SS\
\   S\
\   S\S\4S	 jj5       rS
rg)RagPreTrainedModel   ragTN.question_encoder_pretrained_model_name_or_path'generator_pretrained_model_name_or_path	retrieverreturnc                    UR                  5        VVs0 s H,  u  pVUR                  S5      (       d  M  U[        S5      S U_M.     nnnUR                  5        VVs0 s H,  u  pVUR                  S5      (       d  M  U[        S5      S U_M.     nnnUR                  5        H	  n	USU	-   	 M     UR                  5        H	  n	USU	-   	 M     UR	                  SS5      n
U
cL  Uc   S5       eSSKJn  SU;  a#  SS	KJn  UR                  " U40 UDS
S0D6u  pXS'   UR                  " U40 UD6n
UR	                  SS5      nUcN  Uc   S5       eSSKJ
n  SU;  a%  SS	KJn  UR                  " U40 UDS
S0D6u  nnUUS'   UR                  " U40 UD6nUR                  SS5      nUc,  [        R                  " U
R                  UR                  40 UD6nU " XUUS9$ s  snnf s  snnf )a+  
Instantiates an question encoder and a generator from one or two base classes of the library from pretrained
model checkpoints.

The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
the model, you need to first set it back in training mode with `model.train()`.

Params:
    question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
        Information necessary to initiate the question encoder. Can be either:

            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
              this case, `from_tf` should be set to `True` and a configuration object should be provided as
              `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
              PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

    generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
        Information necessary to initiate the generator. Can be either:

            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
              this case, `from_tf` should be set to `True` and a configuration object should be provided as
              `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
              PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

    model_args (remaining positional arguments, *optional*):
        All remaining positional arguments will be passed to the underlying model's `__init__` method.
    retriever ([`RagRetriever`], *optional*):
        The retriever to use.
    kwwargs (remaining dictionary of keyword arguments, *optional*):
        Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
        `output_attentions=True`).

        - To update the question_encoder configuration, use the prefix *question_encoder_* for each
          configuration parameter.
        - To update the generator configuration, use the prefix *generator_* for each configuration parameter.
        - To update the parent model configuration, do not use a prefix for each configuration parameter.

        Behaves differently depending on whether a `config` is provided or automatically loaded.

Example:

```python
>>> from transformers import RagModel

>>> # initialize a RAG from two pretrained models.
>>> model = RagModel.from_pretrained_question_encoder_generator(
...     "facebook/dpr-question_encoder-single-nq-base", "google-t5/t5-small"
... )
>>> # saving model after fine-tuning
>>> model.save_pretrained("./rag")
>>> # load fine-tuned model
>>> model = RagModel.from_pretrained("./rag")
```question_encoder_N
generator_modelznIf `model` is not defined as an argument, a `question_encoder_pretrained_model_name_or_path` has to be defined   	AutoModelconfig)
AutoConfigreturn_unused_kwargsTzqIf `generator_model` is not defined as an argument, a `generator_pretrained_model_name_or_path` has to be definedAutoModelForSeq2SeqLM)question_encoder	generatorrK   rB   )items
startswithlenkeyspopauto.modeling_autorJ   auto.configuration_autorL   from_pretrainedrO   getr   'from_question_encoder_generator_configsrK   )clsr@   rA   rB   kwargsargumentvaluekwargs_question_encoderkwargs_generatorkeyrP   rJ   rL   question_encoder_configrQ   rO   generator_configrK   s                     r7   *from_pretrained_question_encoder_generator=RagPreTrainedModel.from_pretrained_question_encoder_generator   sl   L $*<<>#
#1""#67 8HS,-/0%7#1 	  #
 $*<<>
#1""<0 1HS&()50#1 	 
 +//1C*S01 2#((*C|c)* + 366wE#AM M 766@CMC]C]BD-D *.D@'
 5L1(88> BY  %(($7	:F !F C//@5?5O5O;6?O6fj62 "2 .> *-==7;KI
 Hd+>FF '')9)9=CF $4RXdmnnO#

s   GGG =G r+   )NNN)r,   r-   r.   r/   r   config_classbase_model_prefix_supports_flash_attn_2_supports_sdpaclassmethodr   strr   r   re   r5   r+   r6   r7   r=   r=      ss     L!N IMAE"&	Jo8@Jo 2:#Jo  	Jo 
Jo Jor6   r=   c            "         ^  \ rS rSr    SS\\   S\\   S\\   S\\   4U 4S jjjr\	              SS\\
R                     S\\
R                     S	\\\\
R                           S
\\
R                     S\\
R                     S\\\\
R                           S\\
R                     S\\
R                     S\\
R                     S\\   S\\   S\\   S\\   S\\   S\\\
R                     \4   4S jj5       rSrU =r$ )RagModeliz  rK   rP   rQ   rB   c                 b  > Uc  Ub  Uc   S5       eUc-  [         R                  " UR                  UR                  40 UD6nO1[        XR                  5      (       d   SU SU R                   35       e[
        TU ]  U5        Uc!  SSKJn  UR                  UR                  5      nUc!  SSKJn  UR                  UR                  5      nX@l        U R                  b9  [        U[        5      (       d   S[        U R                  5       S	35       eX@l        X l
        X0l        SU l        S
U l        g)  
question_encoder (`PreTrainedModel`, *optional*):
    The model responsible for encoding the question into hidden states for retrieval.
generator (`PreTrainedModel`, *optional*):
    The model responsible for generating text based on retrieved documents.
retriever (`RagRetriever`, *optional*):
    The component responsible for retrieving documents from a knowledge base given the encoded question.
NzQEither a configuration or an question_encoder and a generator has to be provided.zconfig: z has to be of type rH   rI   rN   z`self.retriever` is of type z&, but should be of type `RagRetriever`F)r   r[   rK   
isinstancerg   super__init__rW   rJ   from_configrP   rO   rQ   rB   r   typectx_encodercontext_encoder_training)	selfrK   rP   rQ   rB   r]   rJ   rO   	__class__s	           r7   rs   RagModel.__init__|  s8     !&6&ByG\ 	
_	
] >FF '')9)9=CF f&7&788sHVHL_`d`q`q_r:ss8 #6(44V5L5LMB-99&:J:JKI">>%i66 .tDNN/C.DDjk6 'N 0"(-%r6   	input_idsattention_maskencoder_outputsdecoder_input_idsdecoder_attention_maskr   r   r    r!   	use_cacheoutput_attentionsoutput_hidden_statesoutput_retrievedn_docsrC   c                 b   Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R
                  nU R                  SL=(       a%    USL =(       d    U	SL =(       d    USL =(       a    USL nUGc*  U(       Ga  U R                  XSS9nUS   nU R                  UUR                  5       R                  S[        R                  S9R                  5       U R                  R                   R                  USS9nU R                  (       a  US	   US
   US   US   US   US   4u  nn	nnnnUR                  U5      nU	R                  U5      n	UR                  U5      nUR                  U5      nU R!                  UUSS9R"                  nUR%                  SUUR&                  S   5      n[        R(                  " UR+                  S5      UR-                  SS5      5      R/                  S5      nOUS	   US
   US   US   4u  pnnUR                  U5      nUR                  U5      nU	R                  U5      n	[        R(                  " UR+                  S5      UR-                  SS5      5      R/                  S5      nOUc   S5       eU	c   S5       eUc   S5       eUc   S5       eUR&                  S   U-  S:X  d   SU SUR&                  S    S35       eUb  UR1                  USS9nUb  UR1                  USS9nU R                  UU	UUUUU
USS9	nU(       d  SnSnSnSnSnOWR2                  nUR4                  nU(       a  U(       d  SnSn	SnSn[7        S)0 SUR8                  _SU_SUR:                  _S	U_S
U	_SW_SW_S W_S!U_S"U_S#UR<                  _S$UR>                  _S%UR@                  _S&URB                  _S'URD                  _S(URF                  _6$ )*a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagModel
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"])
```NT)r|   return_dictr   cpudevicedtypeptprefixr   return_tensorsr    r!   r   tokenized_doc_idstokenized_doc_attention_maskdoc_idsr   rH   zMake sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z^Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function.M The first dimension of `context_input_ids` should be a multiple of `n_docs`=	, but is .dim)	r{   r|   r}   r~   r   r   r   r   r   Nr   r   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   )$rK   r   r   r   r   r   rB   rP   detachtor1   float32numpyrQ   r   rw   rv   pooler_outputviewshapebmm	unsqueeze	transposesqueezerepeat_interleavehidden_states
attentionsr9   r   r   encoder_last_hidden_stateencoder_hidden_statesencoder_attentionsdecoder_hidden_statesdecoder_attentionscross_attentions)rx   r{   r|   r}   r~   r   r   r   r    r!   r   r   r   r   r   has_to_retrievequestion_enc_outputsr"   retriever_outputsr   retrieved_doc_input_idsretrieved_doc_attention_maskr   gen_outputsr#   r$   s                             r7   forwardRagModel.forward  s   R "-4;;3E3E!*!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 0@/K+QUQ\Q\QmQm NN$& ("d*b.D.LbPZ^bPb(4' 	 "'+'<'<$ (= ($ 6J!5L2$(NN6==?BB%W\WdWdBekkm>>0077!#' %3 %! 00 **=>)*BC)*@A)*=>)*HI))4).,/4) ):(<(<Y(G%-C-F-Fy-Q*.E.H.H.S+3O3R3RS\3]0+/+;+;/@\jn ,< ,#m ) ,@+D+DF$F$L$LQ$O,(
 "':DDQGI]IgIghiklIm"gaj  **=>)*BC)*@A))4	jf%?SUf ,@+B+BCe+f((9(<(<Y(G%-C-F-Fy-Q* "':DDQGI]IgIghiklIm"gaj  )4 P4 .9 T9 "- J-
 % 	
l	
%   #f,2 	
[\b[c d!''*+1.	
2 ( 1 C CFPQ C R!-%;%M%MfZ[%M%\"nn'1+/#9+/ % 

 15.)-&&*##'  $)=)K)K&&:&E&E#&6 '%)"#'  $! 
%%
!
 (77
 0	

 $:
 "6
 0
 0R
 (B
 %<
 -8,Q,Q
 )4(I(I
 &1%C%C
 )4(I(I
 &1%C%C
  (3'C'C!
 	
r6   )rw   rv   rQ   rP   rB   NNNN)NNNNNNNNNNNNNN)r,   r-   r.   r/   r   r   r   r   rs   r   r1   r4   Tensorr   r2   
BoolTensorboolintr   r9   r   r5   __classcell__ry   s   @r7   rn   rn   z  s    .26:/3,00.)*0. #?30. O,	0.
 L)0. 0.d  1515EI8<=AEI268<=A$(,0/3+/ $d
E,,-d
 !.d
 "%e.?.?(@"AB	d

 $E$4$45d
 !))9)9 :d
 "%e.?.?(@"ABd
 U../d
 $E$4$45d
 !))9)9 :d
 D>d
 $D>d
 'tnd
 #4.d
 d
  
uU\\"$66	7!d
 d
r6   rn   zu
    A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
    c            &         ^  \ rS rSr    S'S\\   S\\   S\\   S\\   4U 4S jjjrS\4S jr	S\4S	 jr
\                 S(S
\\R                     S\\R                     S\\\\R                           S\\R                     S\\R                      S\\\\R                           S\\R                     S\\R                     S\\R"                     S\\   S\\   S\\   S\\   S\\   S\\   S\\R                     S\\   S\4$S jj5       r\S 5       r\S 5       r\S 5       r\R4                  " 5                S)S
\\R                     S\\R                     S\\R                     S\\R                     S\\R"                     S \\   S!\\   S"\\   S\\   S\R                  4S# jj5       r S*S$ jr\S% 5       rS&rU =r $ )+RagSequenceForGenerationi  rK   rP   rQ   rB   c                    > Uc  Ub  Uc   S5       eUc,  [         R                  " UR                  UR                  40 UD6n[        TU ]  U5        [        XX4S9U l        grp   NzHEither a configuration or an encoder and a generator has to be provided.)rK   rP   rQ   rB   r   r[   rK   rr   rs   rn   r?   rx   rK   rP   rQ   rB   r]   ry   s         r7   rs   !RagSequenceForGeneration.__init__  sv      !&6&ByG\ 	
V	
] >FF '')9)9=CF 	  6Xawr6   c                 $    XR                   l        g r   r?   rB   rx   rB   s     r7   set_retriever&RagSequenceForGeneration.set_retriever      &r6   rv   c                 F    SU R                   l        XR                   l        g NTr?   rw   rv   rx   rv   s     r7    set_context_encoder_for_training9RagSequenceForGeneration.set_context_encoder_for_training      ,0)*r6   r{   r|   r}   r~   r   r   r    r!   r   r   r   r   r   exclude_bos_scorereduce_losslabelsr   rC   c                 ,   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Uc  UnSn
U R	                  UUUUUUUU	UU
UUUUS9nSnUb=  U R                  UR                  UR                  UUU R                   R                  UUS9n[        S0 SU_SUR                  _SUR                  _SUR                  _S	UR                  _S
UR                  _SUR                  _SUR                  _SUR                  _SUR                   _SUR"                  _SUR$                  _SUR&                  _SUR(                  _SUR*                  _SUR,                  _SUR.                  _6$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
exclude_bos_score (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when computing
    the loss.
reduce_loss (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
    operation.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-sequence-nq")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels)

>>> # or use retriever separately
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(
...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
... ).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
...     decoder_input_ids=labels,
... )
```NFr{   r|   r}   r~   r   r    r!   r   r   r   r   r   r   r   )r   epsilonr   r   r   r   r   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   )rK   r   r   r   r?   get_nllr   r   label_smoothingr   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   )rx   r{   r|   r}   r~   r   r   r    r!   r   r   r   r   r   r   r   r   r   r]   outputsr   s                        r7   r    RagSequenceForGeneration.forward  s   N "-4;;3E3E1B1N-TXT_T_TqTq%0%<k$++BYBY ($*!I(()+/#9/#9!+/!5-  
" <<""!'33"3   D ( 

>>
 ))
 $33	

 &77
 $+#A#A
 ")!=!=
 &77
 07/Y/Y
 (/'I'I
 %,$C$C
 -4,S,S
 )0(K(K
 &-%E%E
 )0(K(K
  &-%E%E!
" (/'I'I#
 	
r6   c                 .    U R                   R                  $ r   r   rx   s    r7   rB   "RagSequenceForGeneration.retrievera      xx!!!r6   c                 .    U R                   R                  $ r   r?   rQ   r   s    r7   rQ   "RagSequenceForGeneration.generatore  r   r6   c                 .    U R                   R                  $ r   r?   rP   r   s    r7   rP   )RagSequenceForGeneration.question_encoderi      xx(((r6   do_deduplicationnum_return_sequences	num_beamsc
                    U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc
  Uc   S5       eU R
                  b  Uc  U R                  XS9S   nU R                  UUR                  5       R                  S[        R                  S9R                  5       U R                  R                   R                  U	SS9S	   nUR                  U5      n/ nXS
'   XS'   SU
S'   Ub  UR                  S   OUR                  S   U	-  n[        U5       GHW  nX?U	-  US-   U	-   nU R                  R                   " U40 U
D6nU(       aV  [        R"                  " [%        U Vs0 s H  n['        UR)                  5       5      U_M     snR+                  5       5      5      nUR                  S   nUb   XUS-    R-                  US5      nU " UUSS9nOnUc   S5       eUc   S5       eUR-                  US5      nXOU	-  US-   U	-   nUR-                  US5      nX_US-   2SS24   nUR-                  US5      nU " UUUUSS9nUS   * R/                  U5      S   nUR1                  UU   5        GMZ     U R3                  XR                   R                  R4                  S9$ s  snf )a  
Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation
for more information on how to set other generate input parameters.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The sequence used as a prompt for the generation. If `input_ids` is not passed, then
        `context_input_ids` has to be provided.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
        retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model is not initialized with a `retriever` or `input_ids` is not given, `context_input_ids` and
        `context_attention_mask` have to be provided to the forward pass. They are returned by
        [`~RagRetriever.__call__`].
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.

        If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores` has to be
        provided to the forward pass. `doc_scores` are returned by [`~RagRetriever.__call__`].
    do_deduplication (`bool`, *optional*):
        Whether or not to deduplicate the generations from different context documents for a given input. Has
        to be set to `False` if used while training with distributed backend.
    num_return_sequences(`int`, *optional*, defaults to 1):
        The number of independently computed returned sequences for each element in the batch. Note that this
        is not the value we pass to the `generator`'s `[`~generation.GenerationMixin.generate`]` function,
        where we set `num_return_sequences` to `num_beams`.
    num_beams (`int`, *optional*, defaults to 1):
        Number of beams for beam search. 1 means no beam search.
    n_docs (`int`, *optional*, defaults to `config.n_docs`)
        Number of documents to retrieve and/or number of documents for which to generate an answer.
    kwargs (`Dict[str, Any]`, *optional*):
        Additional kwargs will be passed to [`~generation.GenerationMixin.generate`].

Return:
    `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
    sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all batches
    finished early due to the `eos_token_id`.
Nz= At least one of input_ids or context_input_ids must be givenr|   r   r   r   r   r   r    r   r   r|   r   T)r   r   zMake sure that `context_attention_mask` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.)r    r!   r   r   r   r   )pad_token_id)rK   r   r   r   r   rB   rP   r   r   r1   r   r   rQ   r   r   rangegeneratestacklistrl   tolistvaluesrepeattopkappend_cat_and_padr   )rx   r{   r|   r    r!   r   r   r   r   r   model_kwargsnum_doc_return_sequencesquestion_hidden_stateshypos
batch_sizeindexgenerator_input_idsoutput_sequencesknum_candidatesnew_input_idsr   individual_input_idsindividual_attention_maskindividual_doc_scorestop_cand_indss                             r7   r   !RagSequenceForGeneration.generatem  sf   B "-4;;3E3E/?/K+QUQ\Q\QmQm$8$D $++JjJj 	! "+!6IDKK<Q<Q	$(9(E 	
K	
E >>%*;*C%)%:%:9%:%def%g" $&--/22%u}}2U[[]~~,,33# !/ ! "!# !2 4 4Y ?$-[!/8+,)-%&+4+@Y__Q'FWF]F]^_F`djFj
:&E"3FNeaiSYEY"Z#~~66#    #(;;tQa4bQaAS_a5GQa4b4i4i4k/l#m -33N
 $ )%!) < C CNTU V}5EY]^-9 T9 "- J-
 (;'A'A"A($ -C6>UZ]^U^bhTh,i),E,L,L^]^,_)(2EAI3F3I(J%(=(D(D^UV(W%&:+D4+&* &fo-334LMaPM LL)-89g 'j   [[5J5J5W5W XXW 5cs   %$K c                 2  ^ ^ [         R                  " TS S 2SS 24   TR                  TR                  S   S5      R	                  T R
                  R                  R                  5      /S5      mUb  UOT R
                  R                  nT R
                  R                  =(       d     T R
                  R                  R                  nUS L=(       a&    TS S 2S4   R                  U5      R                  5       n	U U4S jn
[        R                  R                  USS9R                  UR                  S   U-  USUR!                  S5      5      n[        R                  R                  USS9R#                  S5      R#                  S5      nUS S 2S S 2S S2S S 24   nUS S 2S S 2SS2S S 24   nUS S 2S S 2SS 2S S 24   n[         R                  " XU-   U/SS9nTR#                  S5      R#                  S5      R%                  SUSS5      mTR'                  5       UR'                  5       :X  d   eUR)                  STS9nUR+                  SSS	9nU
" UU5      u  nnU(       a$  U	(       a  US S 2S S 2SS 24   R+                  S5      OUR+                  S5      nUR+                  S5      nUR-                  S5      nUR-                  S5      nU* nU* nU(       a   UR+                  5       nUR+                  5       nUUR!                  S5      -  nS
U-
  U-  UU-  -   nU$ )Nr   r   c                   > TR                  TR                  R                  R                  5      nUR	                  5       (       a$  U R                  US5        UR                  US5        U R                  S5      UR                  S5      4$ N        r   eqrK   rQ   r   anymasked_fill_r   ll
smooth_objpad_maskrx   targets      r7   
_mask_pads4RagSequenceForGeneration.get_nll.<locals>._mask_pads  h    yy!6!6!C!CDH||~~#.''#6::b>:#5#5b#999r6   r   r   rH   r   r   Tr   keepdim      ?)r1   catnewr   fill_rK   rQ   r   r   bos_token_idr
  allr	   
functionallog_softmaxr   sizer   r   r   gathersum	logsumexp)rx   
seq_logitsr   r  r   r   r   r   r  use_bosr  seq_logprobsdoc_logprobsfirst_token_scoressecond_token_scores	remainderrag_logprobsr  r  nll_losssmooth_losseps_ir   s   `  `                   r7   r    RagSequenceForGeneration.get_nll  s    AqrE]FJJv||A:@@AVAVAcAcdegh
 "-4;;3E3E {{//U4;;3H3H3U3Ud*Rvad||/L/P/P/R	: }}000DIIQ6)62zr7J
 }}000CMMbQ[[\^_ *!QA+6*1a1a<8 Aqr1-	yy"4L6XZc!djkl !!!$..r299!VQJzz||//1111  Rv 6!%%"d%;
#B
3J %6'R1ab\a rvvay^^A&
\\!_))!,
3!k||~H%//+K,++B//g)EK,??r6   c           
      ~   U S   R                  [        U  Vs/ s H  o"R                  S   PM     sn5      [        U  Vs/ s H  o"R                  S   PM     sn5      5      R	                  U5      nSnU  H:  nX#XDUR                  S   -   2S UR                  S   24'   XBR                  S   -  nM<     U$ s  snf s  snf )Nr   r   )r  r"  r   maxr  )tensorsr   toutputinds        r7   r   %RagSequenceForGeneration._cat_and_pad?  s     AJNN3G<Gq
G<=sX_C`X_STGGAJX_C`?abhhiuv 	 A;<3qwwqz))<QWWQZ<78771:C    =C`s
   B5B:r?   r   NNNNNNNNNNNNNNNNN)	NNNNNNNNN)Fr  FN)!r,   r-   r.   r/   r   r   r   r   rs   r   r   r   r1   r4   r   r   r   r2   r   r   r   r   propertyrB   rQ   rP   no_gradr   r   staticmethodr   r5   r   r   s   @r7   r   r     sx    .26:/3,0x)*x #?3x O,	x
 L)x x:'| '+O +  1515@D8<=A@D8<=A26$(,0/3+/,0&*-1 $%^
E,,-^
 !.^
 "%ell(;"<=	^

 $E$4$45^
 !))9)9 :^
 "%ell(;"<=^
 $E$4$45^
 !))9)9 :^
 U../^
 D>^
 $D>^
 'tn^
 #4.^
 $D>^
  d^!^
" ))*#^
$ %^
( 
")^
 ^
@ " " " " ) ) ]]_ 15598<=A26+/.2#' $TYE,,-TY !!1!12TY $E$4$45	TY
 !))9)9 :TY U../TY #4.TY 'smTY C=TY TY 
		TY TYn os9v  r6   r   zo
    A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
    c            &       0  ^  \ rS rSr    S/S\\   S\\   S\\   S\\   4U 4S jjjrS\4S jr	S	\4S
 jr
      S0S jr\S 5       r\S 5       r\S 5       r\S 5       rS1S jr\                 S2S\\R*                     S\\R,                     S\\\\R0                           S\\R*                     S\\R2                     S\\\\R0                           S\\R*                     S\\R*                     S\\R,                     S\\   S\\   S\\   S\\   S\\   S\\   S \\R*                     S!\\   S"\4$S# jj5       r\R<                  " 5       SSSSSSSS\" 5       \ " 5       4
S\\R*                     S\\R*                     S\\R*                     S\\R*                     S\\R,                     S!\\   S$\\!   S%\\"\\R0                  /\#\   4      S&\\   S'\\    S"\R*                  4S( jj5       r$S) r%S* r&S+ r'S1S, jr(S3S- jr)S.r*U =r+$ )4RagTokenForGenerationiK  NrK   rP   rQ   rB   c                    > Uc  Ub  Uc   S5       eUc,  [         R                  " UR                  UR                  40 UD6n[        TU ]  U5        [        XX4S9U l        gr   r   r   s         r7   rs   RagTokenForGeneration.__init__Q  sv      !&6&ByG\ 	
V	
] >FF '')9)9=CF 	  6Xawr6   c                 $    XR                   l        g r   r   r   s     r7   r   #RagTokenForGeneration.set_retrievero  r   r6   rv   c                 F    SU R                   l        XR                   l        g r   r   r   s     r7   r   6RagTokenForGeneration.set_context_encoder_for_trainingr  r   r6   c           
      6    Ub  US S 2SS 24   nS UUUUUUSUS.	$ )Nr   T)	r{   r}   r   r!   r~   r   r   do_marginalizer   r+   )	rx   r~   r   r|   r   r}   r   r   r]   s	            r7   prepare_inputs_for_generation3RagTokenForGeneration.prepare_inputs_for_generationv  sB     & 1!RS& 9 .$&4!2.""

 
	
r6   c                 .    U R                   R                  $ r   r   r   s    r7   rB   RagTokenForGeneration.retriever  r   r6   c                 .    U R                   R                  $ r   r   r   s    r7   rQ   RagTokenForGeneration.generator  r   r6   c                 .    U R                   R                  $ r   r   r   s    r7   rP   &RagTokenForGeneration.question_encoder  r   r6   c                 Z   ^^ S mSnU  H  nU[        UU4S jU 5       5      4-  nM     U$ )zeReorders cache for generation. BART-inspired but we need to take care of the extra dimension for docsc                     U R                   S   UR                   S   -  nU R                  " SU/U R                   SS  Q76 n U R                  SU5      n U R                  " S/U R                   SS  Q76 nU$ )Nr   r   r   rH   )r   r   index_select)r   	new_orderr   results       r7   _reorder_stacked>RagTokenForGeneration._reorder_cache.<locals>._reorder_stacked  s    "((+yq/AAF)..r6TM<O<OPQPR<STM)66q)DM"''E]-@-@-DEFMr6   r+   c              3   h   >#    U  H'  nT" UTR                  UR                  5      5      v   M)     g 7fr   )r   r   ).0
past_staterS  beam_idxs     r7   	<genexpr>7RagTokenForGeneration._reorder_cache.<locals>.<genexpr>  s.     peoWa&z8;;z?P?P3QRReos   /2)tuple)r   rX  reordered_past
layer_pastrS  s    `  @r7   _reorder_cache$RagTokenForGeneration._reorder_cache  s?    	 )Jpeopp N * r6   c                 n   Ub  UOU R                   R                  n[        R                  R	                  USS9R                  UR                  S   U-  USUR                  S5      5      n[        R                  " USS9nXER                  S5      R                  S5      -   n[        R                  " USS9$ )Nr   r   r   r   )rK   r   r	   r  r  r   r   r   r1   r   r#  )rx   r$  r   r   r&  r'  log_prob_sums          r7   marginalize!RagTokenForGeneration.marginalize  s    !-4;;3E3E }}000DIIQ6)62zr7J
 ((;#&<&<R&@&J&J2&NN|33r6   r{   r|   r}   r~   r   r   r    r!   r   r   r   r   r   rE  r   r   r   rC   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Uc  UnSn
U R	                  UUUUUUUU	UU
UUUUS9nSnUR
                  nUbA  Uc   eU R                  UR
                  UR                  UUU R                   R                  US9nU(       a  U R                  UUR                  U5      n[        S0 SU_SU_SUR                  _SUR                  _S	UR                  _S
UR                  _SUR                  _SUR                  _SUR                   _SUR"                  _SUR$                  _SUR&                  _SUR(                  _SUR*                  _SUR,                  _SUR.                  _SUR0                  _6$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
    which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
    obtain the indices.

    [What are input IDs?](../glossary#input-ids)
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
    Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
    *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
    sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
    generator's encoder.

    Used by the ([`RagModel`]) model during decoding.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Provide for generation tasks. `None` by default, construct as per instructions for the generator model
    you're using with your RAG instance.
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
    Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
    the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
    Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
    retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
    provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
    Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
    `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
    has to be provided to the forward pass. `doc_scores` can be computed via
    `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
output_retrieved (`bool`, *optional*):
    Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
    `context_attention_mask`. See returned tensors for more detail.
do_marginalize (`bool`, *optional*):
    If `True`, the logits are marginalized over all documents by making use of
    `torch.nn.functional.log_softmax`.
reduce_loss (`bool`, *optional*):
    Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
    operation.
n_docs (`int`, *optional*):
    The number of documents to retrieve.

Example:

```python
>>> from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
>>> retriever = RagRetriever.from_pretrained(
...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
... )
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

>>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
>>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = inputs["input_ids"]
>>> labels = targets["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=labels)

>>> # or use retriever separately
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(
...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
... ).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
...     decoder_input_ids=labels,
... )

>>> # or directly generate
>>> generated = model.generate(
...     context_input_ids=docs_dict["context_input_ids"],
...     context_attention_mask=docs_dict["context_attention_mask"],
...     doc_scores=doc_scores,
... )
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
```NFr   )r   r   r   r   r   r   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   )rK   r   rE  r   r?   r   r   r   r   rb  r   r   r    r!   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   )rx   r{   r|   r}   r~   r   r   r    r!   r   r   r   r   r   rE  r   r   r   r]   r   r   r   s                         r7   r   RagTokenForGeneration.forward  s"   ^ "-4;;3E3E+9+E4;;KeKe%0%<k$++BYBY ($*!I(()+/#9/#9!+/!5-  
" $000<<""'33   D %%fg.@.@&IF' 


 ))
 $33	

 &77
 $+#A#A
 ")!=!=
 &77
 07/Y/Y
 (/'I'I
 %,$C$C
 -4,S,S
 )0(K(K
 &-%E%E
 )0(K(K
  &-%E%E!
" (/'I'I#
 	
r6   generation_configprefix_allowed_tokens_fnlogits_processorstopping_criteriac           	      V  ^^ Uc  U R                   n[        R                  " U5      nUR                  " S%0 UD6nUR	                  SS5      SLnU R                  X}5        Tb  TOU R                  R                  mU R                  Gb  UGc  U R                  XS9S   nU R                  UUR                  5       R                  S[        R                  S9R                  5       U R                  R                  R                   TSS9nUS	   US
   US   npCUR                  U5      nUR                  U5      nUR                  U5      n[        R"                  " UR%                  S5      UR'                  SS5      5      R)                  S5      nUR*                  S   T-  S:X  d   ST SUR*                  S    S35       eUR*                  S   T-  mU R,                  R                  R/                  5       nU" X4SS9n[        R0                  " TUR2                  -  S4UR4                  [        R6                  [9        U R;                  5       5      R<                  S9nUR*                  S   nUS   nS&UU4S jjnU" XGR2                  S9nU" UUR2                  S9US'   UR?                  UR2                  SS9nX\S'   UUS'   XLS'   TUS'   U RA                  UUUUU	UR<                  S9nU RC                  XzS9nUR2                  S:X  aB  URD                  S:  a  [G        SURD                   S35      eU RH                  " U4UUUS SS!.UD6$ UR2                  S:  a=  URD                  UR2                  :  a  [G        S"5      eU RJ                  " U4UUUS S#.UD6$ [G        S$UR2                   35      e)'a	  
Implements RAG token decoding.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The sequence used as a prompt for the generation. If `input_ids` is not passed, then
        `context_input_ids` has to be provided.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.

        If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
        forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
    n_docs (`int`, *optional*, defaults to `config.n_docs`)
        Number of documents to retrieve and/or number of documents for which to generate an answer.
    generation_config (`~generation.GenerationConfig`, *optional*):
        The generation configuration to be used as base parametrization for the generation call. `**kwargs`
        passed to generate matching the attributes of `generation_config` will override them. If
        `generation_config` is not provided, the default will be used, which has the following loading
        priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
        configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
        default values, whose documentation should be checked to parameterize generation.
    prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
        If provided, this function constraints the beam search to allowed tokens only at each step. If not
        provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
        `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
        the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This argument is useful for
        constrained generation conditioned on the prefix, as described in [Autoregressive Entity
        Retrieval](https://arxiv.org/abs/2010.00904).
    logits_processor (`LogitsProcessorList`, *optional*):
        Custom logits processors that complement the default logits processors built from arguments and a
        model's config. If a logit processor is passed that is already created with the arguments or a model's
        config an error is thrown.
    stopping_criteria (`StoppingCriteriaList`, *optional*):
        Custom stopping criteria that complement the default stopping criteria built from arguments and a
        model's config. If a stopping criteria is passed that is already created with the arguments or a
        model's config an error is thrown.
    kwargs (`Dict[str, Any]`, *optional*):
        Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
        forwarded to the `forward` function of the model.

Return:
    `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
    sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches
    finished early due to the `eos_token_id`.
Nr|   r   r   r   r   r   r   r    r!   r   r   rH   r   r   r   T)r{   r|   r   )r   r   r   last_hidden_statec                    > U S S S S 24   R                  TST4U R                  SS  -   5      n U R                  TUT4U R                  SS  -   5      n U R                  TU-  T-  4U R                  SS  -   5      $ )Nr   r
   )reshaper   expand)tensorr   r   r   s     r7   extend_enc_output9RagTokenForGeneration.generate.<locals>.extend_enc_output  s    D$M*22J63JV\\Z[Z\M]3]^F]]J	6#BV\\RSRTEU#UVF>>:	#9F#B"Dv||TUTVGW"WXXr6   )r   r   r   r}   r   )rf  input_ids_seq_lengthencoder_input_idsrg  rh  r   )rf  ri  z)num_return_sequences has to be 1, but is z when doing greedy search.F)rh  ri  rf  synced_gpusstreamerzA`num_return_sequences` has to be smaller or equal to `num_beams`.)rh  ri  rf  rt  uH   `num_beams` has to be an integer strictly superior to 0 (≥ 1), but is r+   r   )&rf  copydeepcopyupdaterZ   _prepare_special_tokensrK   r   rB   rP   r   r   r1   r   r   rQ   r   r   r   r   r   r   r?   get_encoderfullr   decoder_start_token_idlongnext
parametersr   r   _get_logits_processor_get_stopping_criteriar   
ValueError_sample_beam_search)rx   r{   r|   r    r!   r   r   rf  rg  rh  ri  r]   r   kwargs_has_attention_maskr   outr   encoderr}   rr  rk  rp  pre_processorprepared_stopping_criteriar   s         `                 @r7   r   RagTokenForGeneration.generatek  sO   b $ $ 6 6 MM*;<(//9&9$0$4$45Et$LTX$X!$$%6R "-4;;3E3E >>%*;*C%)%:%:9%:%def%g"..&--/22%u}}2U[[]~~,,33# ! C '(,-*+ 8L5 $8#:#:;Q#R  1 4 4Y ?%;%>%>y%I" #9#C#CA#FH\HfHfghjkHlmuuJ "''*V39 	
[\b[c d!''*+1.	
9 ',,Q/69
(($$002!,=rvwJJ+555q944**)*11	
	  )r2+,?@	Y 	Y "33IUpUp!q/@):)D)D0
+,  112C2M2MST1U
 &0\"*9&')?%&!'X22/!5/%=-## 3 
 &*%@%@/ &A &
" &&!+ 559 ?@Q@f@f?g h& &  <<!."<"3!   ((1, 558I8S8SS !dee$$!."<"3!   Z[l[v[vZwx r6   c                 J    U R                   R                  R                  5       $ r   )r?   rQ   get_input_embeddingsr   s    r7   r  *RagTokenForGeneration.get_input_embeddings7  s    xx!!6688r6   c                 J    U R                   R                  R                  5       $ r   )r?   rQ   get_output_embeddingsr   s    r7   r  +RagTokenForGeneration.get_output_embeddings:  s    xx!!7799r6   c                 L    U R                   R                  R                  U5      $ r   )r?   rQ   set_output_embeddings)rx   new_embeddingss     r7   r  +RagTokenForGeneration.set_output_embeddings=  s    xx!!77GGr6   c                     Uc  U R                   R                  nUR                  UR                  5      nUSS2SS24   R	                  5       USS2SS24'   X#SS2S4'   U$ )zCShift input ids one token to the right, and pad with start_token_idNr   r   r   )rK   r|  	new_zerosr   clone)rx   r{   start_token_idshifted_input_idss       r7   shift_tokens_right(RagTokenForGeneration.shift_tokens_right@  se    !![[??N%//	@#,QV#4#:#:#<!QR% "0!Q$  r6   c                   ^ ^ Ub  UOT R                   R                  n[        R                  " TS S 2SS 24   TR	                  TR
                  S   S5      R                  T R                   R                  R                  5      /S5      mU U4S jnT R                  XU5      nTR                  S5      mTR                  5       UR                  5       :X  d   eUR                  STS9n	UR                  SSS9n
U" X5      u  pU	R                  S5      n	U
R                  S5      n
U	* nU
* nU(       a   UR                  5       nUR                  5       nXXR                  S5      -  nSU-
  U-  X-  -   nU$ )	Nr   r   c                   > TR                  TR                  R                  R                  5      nUR	                  5       (       a$  U R                  US5        UR                  US5        U R                  S5      UR                  S5      4$ r  r	  r  s      r7   r  1RagTokenForGeneration.get_nll.<locals>._mask_padsP  r  r6   r   r  Tr  r  )rK   r   r1   r  r  r   r  rQ   r   rb  r   r   r!  r"  r   )rx   r$  r   r  r   r   r   r  r+  r  r  r,  r-  r.  r   s   `  `           r7   r   RagTokenForGeneration.get_nllI  sW   !-4;;3E3EAqrE]FJJv||A:@@AVAVAcAcdegh
	: ''
G!!"%zz||//1111  Rv 6!%%"d%;
#B3VVAY^^A&
3!k||~H%//+K++B//g)E,??r6   r7  r   )NNNNNNr   r8  )Fr  N),r,   r-   r.   r/   r   r   r   r   rs   r   r   rF  r9  rB   rQ   rP   r;  r^  rb  r   r1   r4   r2   r   r   r   r   r   r   r   r:  r   r   r   r   r   r   r  r  r  r  r   r5   r   r   s   @r7   r=  r=  K  s    .26:/3,0x)*x #?3x O,	x
 L)x x<'| '+O + 
: " " " " ) )  &	4  156:@D8<=A@D8<=A26$(,0/3+/)-&*-1 $%j
E,,-j
 !!2!23j
 "%ell(;"<=	j

 $E$4$45j
 !))9)9 :j
 "%ell(;"<=j
 $E$4$45j
 !))9)9 :j
 U../j
 D>j
 $D>j
 'tnj
 #4.j
 !j
  d^!j
" ))*#j
$ %j
( 
")j
 j
X ]]_ 15598<=A26 $8<W[:M:O<P<RIE,,-I !!1!12I $E$4$45	I
 !))9)9 :I U../I I $$45I #+8S%,,4Gc4R+S"TI ##67I $$89I 
		I IV9:H!" "r6   r=  )rn   r=   r   r=  )(r0   rv  dataclassesr   typingr   r   r   r   r   r1   r	   configuration_utilsr   
generationr   r   r   r   modeling_outputsr   modeling_utilsr   utilsr   r   configuration_ragr   retrieval_ragr   
get_loggerr,   loggerr   r9   r=   rn   r   r=  __all__r+   r6   r7   <module>r     s4      ! 9 9   3 f f + - , ( ' 
		H	% [O{ [O [O| VO VO VOr Qo QoQoh X
! X
 X
v 
m1 m
m` 
[. [
[| br6   