Unverified Commit c754c41c authored by Ola Piktus, committed by GitHub

RAG (#6813)

* added rag WIP

* path fix

* Formatting / renaming prior to actual work

* added rag WIP

* path fix

* Formatting / renaming prior to actual work

* added rag WIP

* path fix

* Formatting / renaming prior to actual work

* added rag WIP

* Formatting / renaming prior to actual work

* First commit

* improve comments

* Retrieval evaluation scripts

* refactor to include modeling outputs + MPI retriever

* Fix rag-token model + refactor

* Various fixes + finetuning logic

* use_bos fix

* Retrieval refactor

* Finetuning refactoring and cleanup

* Add documentation and cleanup

* Remove set_up_rag_env.sh file

* Fix retrieval with HF index

* Fix import errors

* Fix quality errors

* Refactor as per suggestions in https://github.com/huggingface/transformers/pull/6813#issuecomment-687208867



* fix quality

* Fix RAG Sequence generation

* minor cleanup plus initial tests

* fix test

* fix tests 2

* Comments fix

* post-merge fixes

* Improve readme + post-rebase refactor

* Extra dependencies for tests

* Fix tests

* Fix tests 2

* Refactor test requirements

* Fix tests 3

* Post-rebase refactor

* rename nlp->datasets

* RAG integration tests

* add tokenizer to slow integration test and allow retriever to run on cpu

* add tests; fix position ids warning

* change structure

* change structure

* add from encoder generator

* save working solution

* make all integration tests pass

* add RagTokenizer.save/from_pretrained and RagRetriever.save/from_pretrained

* don't save paths

* delete unnecessary imports

* pass config to AutoTokenizer.from_pretrained for Rag tokenizers

* init wiki_dpr only once

* hardcode legacy index and passages paths (todo: add the right urls)

* finalize config

* finalize retriver api and config api

* LegacyIndex index download refactor

* add dpr to autotokenizer

* make from pretrained more flexible

* fix ragfortokengeneration

* small name changes in tokenizer

* add labels to models

* change default index name

* add retrieval tests

* finish token generate

* align test with previous version and make all tests pass

* add tests

* finalize tests

* implement Thom's suggestions

* add first version of test

* make first tests work

* make retriever platform agnostic

* naming

* style

* add legacy index URL

* docstrings + simple retrieval test for distributed

* clean model api

* add doc_ids to retriever's outputs

* fix retrieval tests

* finish model outputs

* finalize model api

* fix generate problem for rag

* fix generate for other models

* fix some tests

* save intermediate

* set generate to default

* big refactor generate

* delete rag_api

* correct pip faiss install

* fix auto tokenization test

* fix faiss install

* fix test

* move the distributed logic to examples

* model page

* docs

* finish tests

* fix dependencies

* fix import in __init__

* Refactor eval_rag and finetune scripts

* start docstring

* add psutil to test

* fix tf test

* move require torch to top

* fix retrieval test

* align naming

* finish automodel

* fix repo consistency

* test ragtokenizer save/load

* add rag model output docs

* fix ragtokenizer save/load from pretrained

* fix tokenizer dir

* remove torch in retrieval

* fix docs

* fix finetune scripts

* finish model docs

* finish docs

* remove auto model for now

* add require torch

* remove solved todos

* integrate Sylvain's suggestions

* Sam's comments

* correct mistake on purpose

* improve README

* Add generation test cases

* fix rag token

* clean token generate

* fix test

* add note to test

* fix attention mask

* add t5 test for rag

* Fix handling prefix in finetune.py

* don't overwrite index_name
Co-authored-by: Patrick Lewis <plewis@fb.com>
Co-authored-by: Aleksandra Piktus <piktus@devfair0141.h2.fair>
Co-authored-by: Aleksandra Piktus <piktus@learnfair5102.h2.fair>
Co-authored-by: Aleksandra Piktus <piktus@learnfair5067.h2.fair>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
parent 1ee2194f
......@@ -89,7 +89,8 @@ extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"]
extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
extras["all"] = extras["serving"] + ["tensorflow", "torch"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "psutil", "parameterized"]
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
extras["quality"] = ["black >= 20.8b1", "isort >= 5", "flake8 >= 3.8.3"]
......
......@@ -42,6 +42,7 @@ from .configuration_mmbt import MMBTConfig
from .configuration_mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
from .configuration_pegasus import PegasusConfig
from .configuration_rag import RagConfig
from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
......@@ -86,6 +87,7 @@ from .file_utils import (
cached_path,
is_apex_available,
is_datasets_available,
is_faiss_available,
is_psutil_available,
is_py3nvml_available,
is_tf_available,
......@@ -140,6 +142,9 @@ from .pipelines import (
pipeline,
)
# Retriever
from .retrieval_rag import RagRetriever
# Tokenizers
from .tokenization_albert import AlbertTokenizer
from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
......@@ -172,6 +177,7 @@ from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFas
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .tokenization_pegasus import PegasusTokenizer
from .tokenization_phobert import PhobertTokenizer
from .tokenization_rag import RagTokenizer
from .tokenization_reformer import ReformerTokenizer
from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
......@@ -416,6 +422,7 @@ if is_torch_available():
load_tf_weights_in_openai_gpt,
)
from .modeling_pegasus import PegasusForConditionalGeneration
from .modeling_rag import RagModel, RagSequenceForGeneration, RagTokenForGeneration
from .modeling_reformer import (
REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
ReformerAttention,
......
......@@ -24,6 +24,7 @@ from .configuration_bert_generation import BertGenerationConfig
from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
from .configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
from .configuration_encoder_decoder import EncoderDecoderConfig
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
......@@ -38,6 +39,7 @@ from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfi
from .configuration_mobilebert import MobileBertConfig
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
from .configuration_pegasus import PegasusConfig
from .configuration_rag import RagConfig
from .configuration_reformer import ReformerConfig
from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
......@@ -75,6 +77,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP,
LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
]
for key, value, in pretrained_map.items()
)
......@@ -110,7 +113,9 @@ CONFIG_MAPPING = OrderedDict(
("encoder-decoder", EncoderDecoderConfig),
("funnel", FunnelConfig),
("lxmert", LxmertConfig),
("dpr", DPRConfig),
("layoutlm", LayoutLMConfig),
("rag", RagConfig),
]
)
......@@ -145,6 +150,8 @@ MODEL_NAMES_MAPPING = OrderedDict(
("funnel", "Funnel Transformer"),
("lxmert", "LXMERT"),
("layoutlm", "LayoutLM"),
("dpr", "DPR"),
("rag", "RAG"),
]
)
......
......@@ -14,7 +14,7 @@
# limitations under the License.
""" DPR model configuration """
from .configuration_bert import BertConfig
from .configuration_utils import PretrainedConfig
from .utils import logging
......@@ -27,7 +27,7 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
}
class DPRConfig(BertConfig):
class DPRConfig(PretrainedConfig):
r"""
:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
`DPRModel`.
......@@ -36,12 +36,73 @@ class DPRConfig(BertConfig):
It is used to instantiate the components of the DPR model.
Args:
projection_dim (:obj:`int`, optional, defaults to 0):
vocab_size (:obj:`int`, `optional`, defaults to 30522):
Vocabulary size of the DPR model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler.
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
projection_dim (:obj:`int`, `optional`, defaults to 0):
Dimension of the projection for the context and question encoders.
If it is set to zero (default), then no projection is done.
"""
model_type = "dpr"
def __init__(self, projection_dim: int = 0, **kwargs): # projection of the encoders, 0 for no projection
super().__init__(**kwargs)
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
gradient_checkpointing=False,
projection_dim: int = 0,
**kwargs
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.gradient_checkpointing = gradient_checkpointing
self.projection_dim = projection_dim
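# Usage sketch for the expanded DPRConfig above; the values asserted are the
# documented defaults, and projection_dim=128 is an illustrative override.
from transformers import DPRConfig

config = DPRConfig(projection_dim=128)   # project encoder outputs down to 128 dims
assert config.hidden_size == 768         # BERT-base sized encoder by default
assert config.model_type == "dpr"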
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" RAG model configuration """
import copy
from .configuration_utils import PretrainedConfig
from .file_utils import add_start_docstrings
RAG_CONFIG_DOC = r"""
:class:`~transformers.RagConfig` stores the configuration of a `RagModel`.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
title_sep (:obj:`str`, `optional`, defaults to ``" / "``):
Separator inserted between the title and the text of the retrieved document when calling :class:`~transformers.RagRetriever`.
doc_sep (:obj:`str`, `optional`, defaults to ``" // "``):
Separator inserted between the text of the retrieved document and the original input when calling :class:`~transformers.RagRetriever`.
n_docs (:obj:`int`, `optional`, defaults to 5):
Number of documents to retrieve.
max_combined_length (:obj:`int`, `optional`, defaults to 300):
Max length of contextualized input returned by :meth:`~transformers.RagRetriever.__call__`.
retrieval_vector_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`.
retrieval_batch_size (:obj:`int`, `optional`, defaults to 8):
Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated by :class:`~transformers.RagRetriever`.
dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids using :obj:`datasets.list_datasets()`).
dataset_split (:obj:`str`, `optional`, defaults to :obj:`train`):
Which split of the ``dataset`` to load.
index_name (:obj:`str`, `optional`, defaults to :obj:`compressed`):
The index_name of the index associated with the :obj:`dataset`. One can choose between :obj:`legacy`, :obj:`exact` and :obj:`compressed`.
index_path (:obj:`str`, `optional`):
The path to the serialized faiss index on disk.
passages_path (:obj:`str`, `optional`):
A path to text passages compatible with the faiss index. Required if using :class:`~transformers.retrieval_rag.LegacyIndex`.
use_dummy_dataset (:obj:`bool`, `optional`, defaults to ``False``):
Whether to load a "dummy" variant of the dataset specified by :obj:`dataset`.
label_smoothing (:obj:`float`, `optional`, defaults to 0.0):
Only relevant if ``return_loss`` is set to :obj:`True`. Controls the ``epsilon`` parameter value for label smoothing in the loss calculation.
If set to ``0.0``, no label smoothing is performed.
do_marginalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
If :obj:`True`, the logits are marginalized over all documents
by making use of ``torch.nn.functional.log_softmax``.
reduce_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
If :obj:`True`, the NLL loss is reduced using the ``torch.Tensor.sum`` operation.
do_deduplication (:obj:`bool`, `optional`, defaults to :obj:`True`):
Controls whether we want to deduplicate the generations from different context documents for a given input.
Has to be set to :obj:`False` if used while training with distributed backend.
exclude_bos_score (:obj:`bool`, `optional`, defaults to :obj:`False`):
If :obj:`True`, the score of the BOS token is disregarded when computing
the loss.
output_retrieved (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to ``True``, :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`, :obj:`context_input_ids` and :obj:`context_attention_mask` are returned. See returned tensors for more detail.
"""
@add_start_docstrings(RAG_CONFIG_DOC)
class RagConfig(PretrainedConfig):
model_type = "rag"
def __init__(
self,
vocab_size=None,
is_encoder_decoder=True,
prefix=None,
bos_token_id=None,
pad_token_id=None,
eos_token_id=None,
decoder_start_token_id=None,
title_sep=" / ",
doc_sep=" // ",
n_docs=5,
max_combined_length=300,
retrieval_vector_size=768,
retrieval_batch_size=8,
dataset="wiki_dpr",
dataset_split="train",
index_name="compressed",
index_path=None,
passages_path=None,
use_dummy_dataset=False,
reduce_loss=False,
label_smoothing=0.0,
do_deduplication=True,
exclude_bos_score=False,
do_marginalize=False,
output_retrieved=False,
**kwargs
):
super().__init__(
bos_token_id=bos_token_id,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
is_encoder_decoder=is_encoder_decoder,
prefix=prefix,
vocab_size=vocab_size,
**kwargs,
)
assert (
"question_encoder" in kwargs and "generator" in kwargs
), "Config has to be initialized with question_encoder and generator config"
question_encoder_config = kwargs.pop("question_encoder")
question_encoder_model_type = question_encoder_config.pop("model_type")
decoder_config = kwargs.pop("generator")
decoder_model_type = decoder_config.pop("model_type")
from .configuration_auto import AutoConfig
self.question_encoder = AutoConfig.for_model(question_encoder_model_type, **question_encoder_config)
self.generator = AutoConfig.for_model(decoder_model_type, **decoder_config)
self.reduce_loss = reduce_loss
self.label_smoothing = label_smoothing
self.exclude_bos_score = exclude_bos_score
self.do_marginalize = do_marginalize
self.title_sep = title_sep
self.doc_sep = doc_sep
self.n_docs = n_docs
self.max_combined_length = max_combined_length
self.dataset = dataset
self.dataset_split = dataset_split
self.index_name = index_name
self.retrieval_vector_size = retrieval_vector_size
self.retrieval_batch_size = retrieval_batch_size
self.passages_path = passages_path
self.index_path = index_path
self.use_dummy_dataset = use_dummy_dataset
self.output_retrieved = output_retrieved
self.do_deduplication = do_deduplication
@classmethod
def from_question_encoder_generator_configs(
cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
) -> PretrainedConfig:
r"""
Instantiate a :class:`~transformers.RagConfig` (or a derived class) from a pre-trained question encoder model configuration and generator model configuration.
Returns:
:class:`RagConfig`: An instance of a configuration object
"""
return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs)
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Overrides the default :meth:`~transformers.PretrainedConfig.to_dict`.
Returns:
:obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["question_encoder"] = self.question_encoder.to_dict()
output["generator"] = self.generator.to_dict()
output["model_type"] = self.__class__.model_type
return output
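# Illustrative sketch of building a RagConfig from two sub-configurations via the
# classmethod above; the DPR/BART pairing and the override values are assumptions
# chosen for demonstration only.
from transformers import BartConfig, DPRConfig, RagConfig

question_encoder_config = DPRConfig()
generator_config = BartConfig()
rag_config = RagConfig.from_question_encoder_generator_configs(
    question_encoder_config, generator_config, n_docs=5, index_name="exact"
)
print(rag_config.generator.model_type)  # "bart"
print(rag_config.n_docs)                # 5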
......@@ -69,6 +69,7 @@ try:
import datasets # noqa: F401
_datasets_available = True
logger.debug(f"Succesfully imported datasets version {datasets.__version__}")
except ImportError:
_datasets_available = False
......@@ -119,6 +120,16 @@ try:
except ImportError:
_has_apex = False
try:
import faiss # noqa: F401
_faiss_available = True
logger.debug(f"Succesfully imported faiss version {faiss.__version__}")
except ImportError:
_faiss_available = False
default_cache_path = os.path.join(torch_cache_home, "transformers")
......@@ -175,6 +186,10 @@ def is_apex_available():
return _has_apex
def is_faiss_available():
return _faiss_available
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
......
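# Sketch of guarding faiss-dependent code with the new helper; is_faiss_available
# is re-exported from the top-level transformers package in this change.
from transformers import is_faiss_available

if is_faiss_available():
    import faiss  # noqa: F401
else:
    print("faiss is not installed; install faiss-cpu to use the RAG retriever")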
......@@ -27,6 +27,7 @@ from .configuration_auto import (
CamembertConfig,
CTRLConfig,
DistilBertConfig,
DPRConfig,
ElectraConfig,
EncoderDecoderConfig,
FlaubertConfig,
......@@ -97,6 +98,7 @@ from .modeling_distilbert import (
DistilBertForTokenClassification,
DistilBertModel,
)
from .modeling_dpr import DPRQuestionEncoder
from .modeling_electra import (
ElectraForMaskedLM,
ElectraForMultipleChoice,
......@@ -148,6 +150,11 @@ from .modeling_mobilebert import (
)
from .modeling_openai import OpenAIGPTLMHeadModel, OpenAIGPTModel
from .modeling_pegasus import PegasusForConditionalGeneration
from .modeling_rag import ( # noqa: F401 - need to import all RagModels to be in globals() function
RagModel,
RagSequenceForGeneration,
RagTokenForGeneration,
)
from .modeling_reformer import (
ReformerForMaskedLM,
ReformerForQuestionAnswering,
......@@ -224,6 +231,7 @@ MODEL_MAPPING = OrderedDict(
(FunnelConfig, FunnelModel),
(LxmertConfig, LxmertModel),
(BertGenerationConfig, BertGenerationEncoder),
(DPRConfig, DPRQuestionEncoder),
]
)
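# With (DPRConfig, DPRQuestionEncoder) registered in MODEL_MAPPING, AutoModel now
# resolves DPR checkpoints to the question encoder. A small sketch; downloading the
# public checkpoint is assumed to be possible.
from transformers import AutoModel, DPRQuestionEncoder

model = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
assert isinstance(model, DPRQuestionEncoder)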
......@@ -412,7 +420,6 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
]
)
AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
The model class to instantiate is selected based on the :obj:`model_type` property of the config object
......
......@@ -272,6 +272,7 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
config_class = DPRConfig
load_tf_weights = None
base_model_prefix = "ctx_encoder"
authorized_missing_keys = [r"position_ids"]
def init_weights(self):
self.ctx_encoder.init_weights()
......@@ -285,6 +286,7 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
config_class = DPRConfig
load_tf_weights = None
base_model_prefix = "question_encoder"
authorized_missing_keys = [r"position_ids"]
def init_weights(self):
self.question_encoder.init_weights()
......@@ -298,6 +300,7 @@ class DPRPretrainedReader(PreTrainedModel):
config_class = DPRConfig
load_tf_weights = None
base_model_prefix = "span_predictor"
authorized_missing_keys = [r"position_ids"]
def init_weights(self):
self.span_predictor.encoder.init_weights()
......
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAG model implementation."""
from dataclasses import dataclass
from typing import List, Optional, Tuple
import torch
from .configuration_rag import RagConfig
from .configuration_utils import PretrainedConfig
from .file_utils import add_start_docstrings_to_callable, replace_return_docstrings
from .modeling_outputs import ModelOutput
from .modeling_utils import PreTrainedModel
from .retrieval_rag import RagRetriever
from .utils import logging
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "RagConfig"
@dataclass
class RetrievAugLMMarginOutput(ModelOutput):
"""
Base class for retriever augmented marginalized models outputs.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head.
The score is possibly marginalized over all documents for each vocabulary token.
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
Score between each retrieved document embedding
(see :obj:`retrieved_doc_embeds`) and :obj:`question_encoder_last_hidden_state`.
past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks)
of the decoder that can be used (see ``past_key_values`` input) to
speed up sequential decoding.
retrieved_doc_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs, hidden_size)`, `optional`, returned when `output_retrieved=True`):
Embedded documents retrieved by the retriever.
Is used with ``question_encoder_last_hidden_state`` to compute
the ``doc_scores``.
retrieved_doc_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, config.n_docs)`, `optional`, returned when `output_retrieved=True`):
The indexes of the embedded documents retrieved by the retriever.
context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Input ids post-processed from the retrieved documents
and the question encoder input_ids by the retriever.
context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Attention mask post-processed from the retrieved documents
and the question encoder input_ids by the retriever.
question_encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer
of the question encoder pooled output of the model.
question_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings
+ one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the question encoder at the output of each layer plus the initial embedding outputs.
question_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the question encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
generator_enc_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
generator_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the generator encoder at the output of each layer plus the initial embedding outputs.
generator_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
generator_dec_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the generator decoder at the output of each layer plus the initial embedding outputs.
generator_dec_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
doc_scores: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
retrieved_doc_embeds: Optional[torch.FloatTensor] = None
retrieved_doc_ids: Optional[torch.LongTensor] = None
context_input_ids: Optional[torch.LongTensor] = None
context_attention_mask: Optional[torch.LongTensor] = None
question_encoder_last_hidden_state: Optional[torch.FloatTensor] = None
question_enc_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
question_enc_attentions: Optional[Tuple[torch.FloatTensor]] = None
generator_enc_last_hidden_state: Optional[torch.FloatTensor] = None
generator_enc_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
generator_enc_attentions: Optional[Tuple[torch.FloatTensor]] = None
generator_dec_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
generator_dec_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class RetrievAugLMOutput(ModelOutput):
"""
Args:
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head.
The score is possibly marginalized over all documents for each vocabulary token.
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
Score between each retrieved document embedding (see :obj:`retrieved_doc_embeds`) and :obj:`question_encoder_last_hidden_state`.
past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`,
with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks)
of the decoder that can be used (see ``past_key_values`` input) to
speed up sequential decoding.
retrieved_doc_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs, hidden_size)`, `optional`, returned when `output_retrieved=True`):
Embedded documents retrieved by the retriever.
Is used with ``question_encoder_last_hidden_state`` to compute the ``doc_scores``.
retrieved_doc_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, config.n_docs)`, `optional`, returned when `output_retrieved=True`):
The indexes of the embedded documents retrieved by the retriever.
context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Input ids post-processed from the retrieved documents
and the question encoder input_ids by the retriever.
context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Attention mask post-processed from the retrieved
documents and the question encoder input_ids by the retriever.
question_encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer
of the question encoder pooled output of the model.
question_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the question encoder at the output of each
layer plus the initial embedding outputs.
question_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the question encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
generator_enc_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
generator_enc_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the generator encoder at the output
of each layer plus the initial embedding outputs.
generator_enc_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
generator_dec_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the generator decoder at the output of each layer plus the initial embedding outputs.
generator_dec_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
logits: torch.FloatTensor = None
doc_scores: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
retrieved_doc_embeds: Optional[torch.FloatTensor] = None
retrieved_doc_ids: Optional[torch.LongTensor] = None
context_input_ids: Optional[torch.LongTensor] = None
context_attention_mask: Optional[torch.LongTensor] = None
question_encoder_last_hidden_state: Optional[torch.FloatTensor] = None
question_enc_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
question_enc_attentions: Optional[Tuple[torch.FloatTensor]] = None
generator_enc_last_hidden_state: Optional[torch.FloatTensor] = None
generator_enc_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
generator_enc_attentions: Optional[Tuple[torch.FloatTensor]] = None
generator_dec_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
generator_dec_attentions: Optional[Tuple[torch.FloatTensor]] = None
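# Sketch of consuming the output dataclasses above, reusing the checkpoints from the
# RagModel.forward example further below; output_retrieved=True is required for the
# retrieval fields to be populated.
from transformers import RagModel, RagRetriever, RagTokenizer

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
inputs = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", return_tensors="pt")
outputs = model(input_ids=inputs["input_ids"], output_retrieved=True)
doc_scores = outputs.doc_scores          # (batch_size, n_docs) retrieval scores
doc_ids = outputs.retrieved_doc_ids      # ids of the retrieved documents
context_ids = outputs.context_input_ids  # generator inputs built from the documents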
class RagPreTrainedModel(PreTrainedModel):
r"""
RAG models were released with the paper `Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks <https://arxiv.org/abs/2005.11401>`_ by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.
RAG is a retriever-augmented model that encapsulates three components: a question encoder, a dataset retriever and a generator. The encoder and generator are trainable, while the retriever is just an indexed dataset.
"""
config_class = RagConfig
base_model_prefix = "rag"
authorized_missing_keys = [r"position_ids"]
@classmethod
def from_pretrained_question_encoder_generator(
cls,
question_encoder_pretrained_model_name_or_path: str = None,
generator_pretrained_model_name_or_path: str = None,
retriever: RagRetriever = None,
*model_args,
**kwargs
) -> PreTrainedModel:
r"""Instantiates an question_encoder and a generator from one or two base classes of the library from pre-trained model checkpoints.
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
To train the model, you need to first set it back in training mode with `model.train()`.
Params:
question_encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
information necessary to initiate the question_encoder. Either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/question_encoder``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
generator_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
information necessary to initiate the generator. Either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/generator``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
retriever: (`optional`, ``RagRetriever``) An instance of a :class:`~transformers.RagRetriever` to use as a retriever.
kwargs: (`optional`) Remaining dictionary of keyword arguments.
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attentions=True``).
- To update the question_encoder configuration, use the prefix `question_encoder_` for each configuration parameter
- To update the generator configuration, use the prefix `generator_` for each configuration parameter
- To update the parent model configuration, do not use a prefix for each configuration parameter
Behaves differently depending on whether a :obj:`config` is provided or automatically loaded.
Example::
>>> from transformers import RagModel
>>> # initialize a RAG from two pretrained models.
>>> model = RagModel.from_question_encoder_generator_pretrained('facebook/dpr-question_encoder-single-nq-base', 't5-small')
>>> # saving model after fine-tuning
>>> model.save_pretrained("./rag")
>>> # load fine-tuned model
>>> model = RagModel.from_pretrained("./rag")
"""
kwargs_question_encoder = {
argument[len("question_question_encoder_") :]: value
for argument, value in kwargs.items()
if argument.startswith("question_encoder_")
}
kwargs_generator = {
argument[len("generator_") :]: value
for argument, value in kwargs.items()
if argument.startswith("generator_")
}
# remove question_encoder, generator kwargs from kwargs
for key in kwargs_question_encoder.keys():
del kwargs["question_encoder_" + key]
for key in kwargs_generator.keys():
del kwargs["generator_" + key]
# Load and initialize the question_encoder and generator
# The distinction between question_encoder and generator at the model level is made
# by the value of the flag `is_generator` that we need to set correctly.
question_encoder = kwargs_question_encoder.pop("model", None)
if question_encoder is None:
assert (
question_encoder_pretrained_model_name_or_path is not None
), "If `model` is not defined as an argument, a `question_encoder_pretrained_model_name_or_path` has to be defined"
from .modeling_auto import AutoModel
if "config" not in kwargs_question_encoder:
from .configuration_auto import AutoConfig
question_encoder_config = AutoConfig.from_pretrained(question_encoder_pretrained_model_name_or_path)
kwargs_question_encoder["config"] = question_encoder_config
question_encoder = AutoModel.from_pretrained(
question_encoder_pretrained_model_name_or_path, *model_args, **kwargs_question_encoder
)
generator = kwargs_generator.pop("model", None)
if generator is None:
assert (
generator_pretrained_model_name_or_path is not None
), "If `generator_model` is not defined as an argument, a `generator_pretrained_model_name_or_path` has to be defined"
from .modeling_auto import AutoModelForSeq2SeqLM
if "config" not in kwargs_generator:
from .configuration_auto import AutoConfig
generator_config = AutoConfig.from_pretrained(generator_pretrained_model_name_or_path)
kwargs_generator["config"] = generator_config
generator = AutoModelForSeq2SeqLM.from_pretrained(
generator_pretrained_model_name_or_path, **kwargs_generator
)
# instantiate config with corresponding kwargs
config = kwargs.get("config", None)
if config is None:
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
return cls(question_encoder=question_encoder, generator=generator, config=config, retriever=retriever)
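# Minimal sketch of the classmethod above; both checkpoint names are real public
# models, and n_docs=5 only illustrates an un-prefixed, top-level config override.
from transformers import RagModel

model = RagModel.from_pretrained_question_encoder_generator(
    "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large", n_docs=5
)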
RAG_START_DOCSTRING = r"""
RAG is a seq2seq model which encapsulates two core components: a question encoder and a generator.
During a forward pass, we encode the input with the question encoder and pass it
to the retriever to extract relevant context documents. The documents are then prepended to the input.
Such contextualized inputs are passed to the generator.
The question encoder can be any `autoencoding` model, preferably :obj:`~transformers.DPRQuestionEncoder`, and the generator can be any `seq2seq` model, preferably :obj:`~transformers.BartForConditionalGeneration`.
The model can be initialized with a :obj:`~transformers.RagRetriever` for end-to-end generation or used in combination with the outputs of a retriever in multiple steps - see examples for more details.
The model is compatible with any `autoencoding` model as the ``question_encoder`` and any `seq2seq` model with language model head as the ``generator``.
The model has been tested with :class:`~transformers.DPRQuestionEncoder` as the ``question_encoder`` and :class:`~transformers.BartForConditionalGeneration` or :class:`~transformers.T5ForConditionalGeneration` as the ``generator``.
This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
usage and behavior.
Args:
config (:class:`~transformers.RagConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
question_encoder (:class:`transformers.PreTrainedModel`):
An encoder model compatible with the faiss index encapsulated by the ``retriever``.
generator (:class:`transformers.PreTrainedModel`):
A seq2seq model used as the generator in the RAG architecture.
retriever (:class:`~transformers.RagRetriever`):
A retriever class encapsulating a faiss index queried to obtain context documents for current inputs.
"""
RAG_FORWARD_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
:class:`~transformers.RagConfig`, used to initialize the model, specifies which generator to use; it also specifies a compatible
generator tokenizer. Use that tokenizer class to obtain the indices.
attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices in input_ids.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`):
Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: :obj:`attentions`)
`last_hidden_state` of shape :obj:`(batch_size, n_docs * sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the encoder.
`doc_scores` of shape :obj:`(batch_size, n_docs)` store retrieval scores of documents retrieved for each input in the batch.
Used by the (:class:`~transformers.RagTokenForGeneration`) model during decoding.
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`):
Provide for generation tasks. `None` by default, construct as per instructions for the generator model you're using with your RAG instance.
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`):
Tuple consists of two elements: :obj:`encoder_outputs` of the RAG model (see :obj:`encoder_outputs`) and :obj:`past_key_values` of the underlying generator.
Can be used to speed up decoding. :obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`)
model during decoding.
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
Score between each retrieved document embedding (see :obj:`retrieved_doc_embeds`) and :obj:`question_encoder_last_hidden_state`.
If the model is not initialized with a ``retriever``, :obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via :obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more information.
context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
If the model is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided to the forward pass. :obj:`context_input_ids` are returned by :meth:`~transformers.RagRetriever.__call__`.
context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Attention mask post-processed from the retrieved documents and the question encoder input_ids by the retriever.
If the model is not initialized with a ``retriever``, :obj:`context_attention_mask` has to be provided to the forward pass. :obj:`context_attention_mask` are returned by :meth:`~transformers.RagRetriever.__call__`.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
If `use_cache` is True, ``past_key_values`` are returned and can be used to speed up decoding (see
``past_key_values``).
output_attentions (:obj:`bool`, `optional`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
output_retrieved (:obj:`bool`, `optional`):
If set to ``True``, :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`, :obj:`context_input_ids` and :obj:`context_attention_mask` are returned. See returned tensors for more detail.
"""
@add_start_docstrings_to_callable(RAG_START_DOCSTRING)
class RagModel(RagPreTrainedModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[PreTrainedModel] = None,
generator: Optional[PreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,  # or maybe just use a `set_retriever(...)` method
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an question_encoder and a generator has to be provided."
if config is None:
config = RagConfig.from_question_encoder_generator_configs(
question_encoder.config, generator.config, **kwargs
)
else:
assert isinstance(config, self.config_class), "config: {} has to be of type {}".format(
config, self.config_class
)
super().__init__(config)
if question_encoder is None:
from .modeling_auto import AutoModel
question_encoder = AutoModel.from_config(config.question_encoder)
if generator is None:
from .modeling_auto import AutoModelForSeq2SeqLM
generator = AutoModelForSeq2SeqLM.from_config(config.generator)
self.retriever = retriever
if self.retriever is not None:
assert isinstance(
retriever, RagRetriever
), f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"
self.retriever = retriever
self.question_encoder = question_encoder
self.generator = generator
@add_start_docstrings_to_callable(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=RetrievAugLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_outputs=None,
decoder_input_ids=None,
decoder_attention_mask=None,
past_key_values=None,
doc_scores=None,
context_input_ids=None,
context_attention_mask=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
output_retrieved=None,
):
r"""
Returns:
Example::
>>> from transformers import RagTokenizer, RagRetriever, RagModel
>>> import torch
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
>>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_retrieved = output_retrieved if output_retrieved is not None else self.config.output_retrieved
# whether retriever has to be used
has_to_retrieve = (
self.retriever is not None
and (context_input_ids is None or context_attention_mask is None or doc_scores is None)
and encoder_outputs is None
)
# encoder_outputs are pre-computed during RAG-token generation
if encoder_outputs is None:
if has_to_retrieve:
question_enc_outputs = self.question_encoder(
input_ids, attention_mask=attention_mask, return_dict=True
)
question_encoder_last_hidden_state = question_enc_outputs[0] # hidden states of question encoder
retriever_outputs = self.retriever(
input_ids,
question_encoder_last_hidden_state.cpu().detach().to(torch.float32).numpy(),
prefix=self.generator.config.prefix,
n_docs=self.config.n_docs,
return_tensors="pt",
)
context_input_ids, context_attention_mask, retrieved_doc_embeds, retrieved_doc_ids = (
retriever_outputs["context_input_ids"],
retriever_outputs["context_attention_mask"],
retriever_outputs["retrieved_doc_embeds"],
retriever_outputs["doc_ids"],
)
# set to correct device
retrieved_doc_embeds = retrieved_doc_embeds.to(question_encoder_last_hidden_state)
context_input_ids = context_input_ids.to(input_ids)
context_attention_mask = context_attention_mask.to(input_ids)
# compute doc_scores
doc_scores = torch.bmm(
question_encoder_last_hidden_state.unsqueeze(1), retrieved_doc_embeds.transpose(1, 2)
).squeeze(1)
else:
assert (
context_input_ids is not None
), "Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function."
assert (
context_attention_mask is not None
), "Make sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function."
assert (
doc_scores is not None
), "Make sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function."
assert (
doc_scores is not None
), "Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function."
# Decoder input without context documents
if decoder_input_ids is not None:
decoder_input_ids = decoder_input_ids.repeat_interleave(self.config.n_docs, dim=0)
if decoder_attention_mask is not None:
decoder_attention_mask = decoder_attention_mask.repeat_interleave(self.config.n_docs, dim=0)
gen_outputs = self.generator(
input_ids=context_input_ids,
attention_mask=context_attention_mask,
encoder_outputs=encoder_outputs,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
return_dict=True,
)
if not has_to_retrieve:
question_encoder_last_hidden_state = None
question_enc_hidden_states = None
question_enc_attentions = None
retrieved_doc_embeds = None
retrieved_doc_ids = None
else:
question_enc_hidden_states = question_enc_outputs.hidden_states
question_enc_attentions = question_enc_outputs.attentions
if not has_to_retrieve or not output_retrieved:
# don't output retrieved docs
context_input_ids = None
context_attention_mask = None
retrieved_doc_embeds = None
retrieved_doc_ids = None
return RetrievAugLMOutput(
logits=gen_outputs.logits,
doc_scores=doc_scores,
past_key_values=gen_outputs.past_key_values,
context_input_ids=context_input_ids,
context_attention_mask=context_attention_mask,
retrieved_doc_embeds=retrieved_doc_embeds,
retrieved_doc_ids=retrieved_doc_ids,
question_encoder_last_hidden_state=question_encoder_last_hidden_state,
question_enc_hidden_states=question_enc_hidden_states,
question_enc_attentions=question_enc_attentions,
generator_enc_last_hidden_state=gen_outputs.encoder_last_hidden_state,
generator_enc_hidden_states=gen_outputs.encoder_hidden_states,
generator_enc_attentions=gen_outputs.encoder_attentions,
generator_dec_hidden_states=gen_outputs.decoder_hidden_states,
generator_dec_attentions=gen_outputs.decoder_attentions,
)
@add_start_docstrings_to_callable(
"""A RAG-sequence model impementation. It performs RAG-sequence specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
class RagSequenceForGeneration(RagPreTrainedModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[PreTrainedModel] = None,
generator: Optional[PreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an encoder and a generator has to be provided."
if config is None:
config = RagConfig.from_encoder_generator_configs(question_encoder.config, generator.config, **kwargs)
super().__init__(config)
# instantiate model
self.rag = RagModel(config=config, question_encoder=question_encoder, generator=generator, retriever=retriever)
def set_retriever(self, retriever: RagRetriever):
self.rag.retriever = retriever
@add_start_docstrings_to_callable(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=RetrievAugLMMarginOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_outputs=None,
decoder_input_ids=None,
decoder_attention_mask=None,
past_key_values=None,
context_input_ids=None,
context_attention_mask=None,
doc_scores=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
output_retrieved=None,
exclude_bos_score=None,
reduce_loss=None,
labels=None,
**kwargs # needs kwargs for generation
):
r"""
exclude_bos_score (:obj:`bool`, `optional`):
Only relevant if ``labels`` is passed.
If :obj:`True`, the score of the BOS token is disregarded when computing
the loss.
reduce_loss (:obj:`bool`, `optional`):
Only relevant if ``labels`` is passed.
If :obj:`True`, the NLL loss is reduced using the ``torch.Tensor.sum`` operation.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Legacy dictionary, which is required so that the model can use the `generate()` function.
Returns:
Example::
>>> from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
>>> import torch
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
>>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
>>> # or use retriever separately
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
>>> # or directly generate
>>> generated = model.generate(input_ids=input_dict["input_ids"])
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
"""
exclude_bos_score = exclude_bos_score if exclude_bos_score is not None else self.config.exclude_bos_score
reduce_loss = reduce_loss if reduce_loss is not None else self.config.reduce_loss
if labels is not None:
if decoder_input_ids is None:
decoder_input_ids = labels
use_cache = False
outputs = self.rag(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_outputs=encoder_outputs,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
context_input_ids=context_input_ids,
context_attention_mask=context_attention_mask,
doc_scores=doc_scores,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
output_retrieved=output_retrieved,
)
loss = None
if labels is not None:
loss = self.get_nll(
outputs.logits,
outputs.doc_scores,
decoder_input_ids,
reduce_loss=reduce_loss,
epsilon=self.config.label_smoothing,
exclude_bos_score=exclude_bos_score,
)
return RetrievAugLMMarginOutput(
loss=loss,
logits=outputs.logits,
doc_scores=outputs.doc_scores,
past_key_values=outputs.past_key_values,
context_input_ids=outputs.context_input_ids,
context_attention_mask=outputs.context_attention_mask,
retrieved_doc_embeds=outputs.retrieved_doc_embeds,
retrieved_doc_ids=outputs.retrieved_doc_ids,
question_encoder_last_hidden_state=outputs.question_encoder_last_hidden_state,
question_enc_hidden_states=outputs.question_enc_hidden_states,
question_enc_attentions=outputs.question_enc_attentions,
generator_enc_last_hidden_state=outputs.generator_enc_last_hidden_state,
generator_enc_hidden_states=outputs.generator_enc_hidden_states,
generator_enc_attentions=outputs.generator_enc_attentions,
generator_dec_hidden_states=outputs.generator_dec_hidden_states,
generator_dec_attentions=outputs.generator_dec_attentions,
)
@property
def retriever(self):
return self.rag.retriever
@property
def generator(self):
return self.rag.generator
@property
def question_encoder(self):
return self.rag.question_encoder
@torch.no_grad()
def generate(
self,
input_ids,
context_input_ids=None,
do_deduplication=None, # defaults to True
num_return_sequences=None, # defaults to 1
num_beams=None, # defaults to 1
**kwargs
):
"""
Implements RAG sequence "thorough" decoding.
Read the :meth:`~transformers.PreTrainedModel.generate` documentation for more information on how to set other generate input parameters.
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
The sequence used as a prompt for the generation. If :obj:`input_ids` is not passed, then :obj:`context_input_ids` has to be provided.
context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
do_deduplication (:obj:`bool`, `optional`):
Controls whether we want to deduplicate the generations from different context documents for a given input.
Has to be set to :obj:`False` if used while training with a distributed backend.
num_return_sequences(:obj:`int`, `optional`, defaults to 1):
The number of independently computed returned sequences for each element in the batch. Note that this is not the value
we pass to the ``generator``'s :func:`~transformers.PreTrainedModel.generate` function, where we set ``num_return_sequences``
to ``num_beams``.
num_beams (:obj:`int`, `optional`, defaults to 1):
Number of beams for beam search. 1 means no beam search.
kwargs:
Additional kwargs will be passed to :meth:`~transformers.PreTrainedModel.generate`.
Return:
:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`:
The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
shorter if all batches finished early due to the :obj:`eos_token_id`.
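Example (a minimal sketch; it assumes the pretrained ``facebook/rag-sequence-nq`` checkpoint, a working ``datasets``/``faiss`` installation and the dummy ``wiki_dpr`` index to keep the download small)::
>>> from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
>>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", return_tensors="pt")
>>> generated = model.generate(input_ids=input_dict["input_ids"], num_beams=4, num_return_sequences=2)
>>> generated_strings = tokenizer.batch_decode(generated, skip_special_tokens=True)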
"""
do_deduplication = do_deduplication if do_deduplication is not None else self.config.do_deduplication
num_doc_return_sequences = (
num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
)
num_beams = num_beams if num_beams is not None else self.config.num_beams
# TODO(patrick) - clean up generate here
if self.retriever is not None and context_input_ids is None:
question_hidden_states = self.question_encoder(input_ids)[0]
context_input_ids = self.retriever(
input_ids,
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
prefix=self.generator.config.prefix,
n_docs=self.config.n_docs,
return_tensors="pt",
)["context_input_ids"]
# set to correct device
context_input_ids = context_input_ids.to(input_ids)
hypos = []
kwargs["num_beams"] = num_beams
kwargs["num_return_sequences"] = num_return_sequences
kwargs["attention_mask"] = None
for index in range(len(input_ids)):
# first, generate beams from documents:
generator_input_ids = context_input_ids[
index * self.config.n_docs : (index + 1) * self.config.n_docs
] # (n_docs, max_len)
output_sequences = self.generator.generate(
generator_input_ids,
**kwargs,
) # n_docs * n_beam, tgt_len
if do_deduplication:
# deduplicate identical generations coming from different context documents
output_sequences = torch.stack(list({str(k.tolist()): k for k in output_sequences}.values()))
# then, run model forwards to get nll scores:
new_input_ids = input_ids[index : index + 1].repeat(len(output_sequences), 1)
outputs = self(new_input_ids, labels=output_sequences, exclude_bos_score=True)
top_cand_inds = (-outputs["loss"]).topk(num_doc_return_sequences)[1]
# add hypothesis
hypos.append(output_sequences[top_cand_inds])
return self._cat_and_pad(hypos, pad_token_id=self.config.generator.pad_token_id)
def get_nll(self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0, exclude_bos_score=False):
# shift tokens left
target = torch.cat(
[target[:, 1:], target.new(target.shape[0], 1).fill_(self.config.generator.pad_token_id)], 1
)
# bos_token_id is None for T5
use_bos = self.config.bos_token_id is not None and target[:, 0].eq(self.config.bos_token_id).all()
def _mask_pads(ll, smooth_obj):
pad_mask = target.eq(self.config.generator.pad_token_id)
if pad_mask.any():
ll.masked_fill_(pad_mask, 0.0)
smooth_obj.masked_fill_(pad_mask, 0.0)
return ll.squeeze(-1), smooth_obj.squeeze(-1)
seq_logprobs = torch.nn.functional.log_softmax(seq_logits, dim=-1).view(
seq_logits.shape[0] // self.config.n_docs, self.config.n_docs, -1, seq_logits.size(-1)
) # batch_size x n_docs x tgt_len x dim
doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
# RAG-sequence marginalization
first_token_scores = seq_logprobs[:, :, :1, :]
second_token_scores = seq_logprobs[:, :, 1:2, :]
remainder = seq_logprobs[:, :, 2:, :]
rag_logprobs = torch.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)
# calculate loss
target = target.unsqueeze(1).unsqueeze(-1).repeat(1, self.config.n_docs, 1, 1)
assert target.dim() == rag_logprobs.dim()
ll = rag_logprobs.gather(dim=-1, index=target)
smooth_obj = rag_logprobs.sum(dim=-1, keepdim=True) # total sum of all (normalised) logits
ll, smooth_obj = _mask_pads(ll, smooth_obj)
# sum over tokens, exclude bos while scoring
ll = ll[:, :, 1:].sum(2) if exclude_bos_score and use_bos else ll.sum(2)
smooth_obj = smooth_obj.sum(2)
ll = ll.logsumexp(1) # logsumexp over docs
smooth_obj = smooth_obj.logsumexp(1)
nll_loss = -ll
smooth_loss = -smooth_obj
if reduce_loss:
nll_loss = nll_loss.sum()
smooth_loss = smooth_loss.sum()
eps_i = epsilon / rag_logprobs.size(-1)
loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
return loss
@staticmethod
def _cat_and_pad(tensors, pad_token_id):
output = (
tensors[0].new(sum([t.shape[0] for t in tensors]), max([t.shape[1] for t in tensors])).fill_(pad_token_id)
)
ind = 0
for t in tensors:
output[ind : ind + t.shape[0], : t.shape[1]] = t
ind += t.shape[0]
return output
@add_start_docstrings_to_callable(
"""A RAG-token model impementation. It performs RAG-token specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
class RagTokenForGeneration(RagPreTrainedModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
question_encoder: Optional[PreTrainedModel] = None,
generator: Optional[PreTrainedModel] = None,
retriever: Optional[RagRetriever] = None,
**kwargs,
):
assert config is not None or (
question_encoder is not None and generator is not None
), "Either a configuration or an encoder and a generator has to be provided."
if config is None:
config = RagConfig.from_encoder_generator_configs(question_encoder.config, generator.config, **kwargs)
super().__init__(config)
# instantiate model
self.rag = RagModel(config=config, question_encoder=question_encoder, generator=generator, retriever=retriever)
def set_retriever(self, retriever: RagRetriever):
self.rag.retriever = retriever
def adjust_logits_during_generation(self, logits, cur_len, max_length):
return self.rag.generator.adjust_logits_during_generation(logits, cur_len=cur_len, max_length=max_length)
def prepare_inputs_for_generation(
self, decoder_input_ids, past, attention_mask, use_cache, encoder_outputs, doc_scores, **kwargs
):
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"doc_scores": doc_scores,
"context_attention_mask": attention_mask,
"decoder_input_ids": decoder_input_ids,
"past_key_values": past,
"use_cache": use_cache,
"do_marginalize": True,
}
@property
def retriever(self):
return self.rag.retriever
@property
def generator(self):
return self.rag.generator
@property
def question_encoder(self):
return self.rag.question_encoder
@staticmethod
def _reorder_cache(past, beam_idx):
"""Reorders cache for generation. BART-inspired but we need to take care of the extra dimension for docs"""
def _reorder_stacked(hidden_states):
n_docs = hidden_states.shape[0] // beam_idx.shape[0]
hidden_states = hidden_states.view(-1, n_docs, *hidden_states.shape[1:])
hidden_states = hidden_states.index_select(0, beam_idx)
return hidden_states.view(-1, *hidden_states.shape[2:])
def _reorder_buffer(attn_cache):
for k, input_buffer_k in attn_cache.items():
if input_buffer_k is not None:
attn_cache[k] = _reorder_stacked(input_buffer_k)
return attn_cache
reordered_past = []
for layer_past in past:
# get the correct batch idx from decoder layer's batch dim for cross and self-attn
layer_past_new = {attn_key: _reorder_buffer(attn_cache) for attn_key, attn_cache in layer_past.items()}
reordered_past.append(layer_past_new)
return reordered_past
def marginalize(self, seq_logits, doc_scores):
# RAG-token marginalization
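# Shapes (a descriptive note): ``seq_logits`` is ``(batch_size * n_docs, tgt_len, vocab_size)`` and
# ``doc_scores`` is ``(batch_size, n_docs)``. After reshaping, the per-document token log-probabilities
# and the document log-probabilities are added in log space, and the ``logsumexp`` over the ``n_docs``
# dimension below yields token log-probabilities marginalized over the retrieved documents.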
seq_logprobs = torch.nn.functional.log_softmax(seq_logits, dim=-1).view(
seq_logits.shape[0] // self.config.n_docs, self.config.n_docs, -1, seq_logits.size(-1)
)
doc_logprobs = torch.log_softmax(doc_scores, dim=1)
log_prob_sum = seq_logprobs + doc_logprobs.unsqueeze(-1).unsqueeze(-1)
return torch.logsumexp(log_prob_sum, dim=1)
@add_start_docstrings_to_callable(RAG_FORWARD_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=RetrievAugLMMarginOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_outputs=None,
decoder_input_ids=None,
decoder_attention_mask=None,
past_key_values=None,
context_input_ids=None,
context_attention_mask=None,
doc_scores=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
output_retrieved=None,
do_marginalize=None,
reduce_loss=None,
labels=None,
**kwargs # needs kwargs for generation
):
r"""
do_marginalize (:obj:`bool`, `optional`):
If :obj:`True`, the logits are marginalized over all documents
by making use of ``torch.nn.functional.log_softmax``.
reduce_loss (:obj:`bool`, `optional`):
Only relevant if ``labels`` is passed.
If :obj:`True`, the NLL loss is reduced using the ``torch.Tensor.sum`` operation.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Legacy dictionary, which is required so that the model can use the `generate()` function.
Returns:
Example::
>>> from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
>>> import torch
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
>>> # initialize with RagRetriever to do everything in one forward call
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
>>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="pt")
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
>>> # or use retriever separately
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
>>> # 2. Retrieve
>>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
>>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
>>> # 3. Forward to generator
>>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
>>> # or directly generate
>>> generated = model.generate(input_ids=input_dict["input_ids"])
>>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
"""
do_marginalize = do_marginalize if do_marginalize is not None else self.config.do_marginalize
reduce_loss = reduce_loss if reduce_loss is not None else self.config.reduce_loss
if labels is not None:
if decoder_input_ids is None:
decoder_input_ids = labels
use_cache = False
outputs = self.rag(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_outputs=encoder_outputs,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
context_input_ids=context_input_ids,
context_attention_mask=context_attention_mask,
doc_scores=doc_scores,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
output_retrieved=output_retrieved,
)
loss = None
logits = outputs.logits
if labels is not None:
assert decoder_input_ids is not None
loss = self.get_nll(
outputs.logits,
outputs.doc_scores,
labels,
reduce_loss=reduce_loss,
epsilon=self.config.label_smoothing,
)
if do_marginalize:
logits = self.marginalize(logits, outputs.doc_scores)
return RetrievAugLMMarginOutput(
loss=loss,
logits=logits,
doc_scores=outputs.doc_scores,
past_key_values=outputs.past_key_values,
context_input_ids=outputs.context_input_ids,
context_attention_mask=outputs.context_attention_mask,
retrieved_doc_embeds=outputs.retrieved_doc_embeds,
retrieved_doc_ids=outputs.retrieved_doc_ids,
question_encoder_last_hidden_state=outputs.question_encoder_last_hidden_state,
question_enc_hidden_states=outputs.question_enc_hidden_states,
question_enc_attentions=outputs.question_enc_attentions,
generator_enc_last_hidden_state=outputs.generator_enc_last_hidden_state,
generator_enc_hidden_states=outputs.generator_enc_hidden_states,
generator_enc_attentions=outputs.generator_enc_attentions,
generator_dec_hidden_states=outputs.generator_dec_hidden_states,
generator_dec_attentions=outputs.generator_dec_attentions,
)
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.LongTensor] = None,
context_input_ids=None,
context_attention_mask=None,
doc_scores=None,
max_length=None,
min_length=None,
early_stopping=None,
use_cache=None,
num_beams=None,
bos_token_id=None,
pad_token_id=None,
eos_token_id=None,
length_penalty=None,
no_repeat_ngram_size=None,
bad_words_ids=None,
num_return_sequences=None,
decoder_start_token_id=None,
**kwargs
):
"""
Implements RAG token decoding.
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
The sequence used as a prompt for the generation. If :obj:`input_ids` is not passed, then :obj:`context_input_ids` has to be provided.
context_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
If the model is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided to the forward pass. :obj:`context_input_ids` are returned by :meth:`~transformers.RagRetriever.__call__`
context_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size * config.n_docs, config.max_combined_length)`, `optional`, returned when `output_retrieved=True`):
Attention mask post-processed from the retrieved documents and the question encoder input_ids by the retriever.
If the model is not initialized with a ``retriever``, :obj:`context_attention_mask` has to be provided to the forward pass. :obj:`context_attention_mask` are returned by :meth:`~transformers.RagRetriever.__call__`
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and :obj:`question_encoder_last_hidden_state`.
If the model is not initialized with a ``retriever``, :obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via :obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more information.
max_length (:obj:`int`, `optional`, defaults to 20):
The maximum length of the sequence to be generated.
min_length (:obj:`int`, `optional`, defaults to 10):
The minimum length of the sequence to be generated.
early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should use the past last key/values attentions (if applicable to the model) to
speed up decoding.
pad_token_id (:obj:`int`, `optional`):
The id of the `padding` token.
bos_token_id (:obj:`int`, `optional`):
The id of the `beginning-of-sequence` token.
eos_token_id (:obj:`int`, `optional`):
The id of the `end-of-sequence` token.
length_penalty (:obj:`float`, `optional`, defaults to 1.0):
Exponential penalty to the length. 1.0 means no penalty.
Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
order to encourage the model to produce longer sequences.
no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
If set to int > 0, all ngrams of that size can only occur once.
bad_words_ids(:obj:`List[int]`, `optional`):
List of token ids that are not allowed to be generated. In order to get the tokens of the words that
should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
num_beams (:obj:`int`, `optional`, defaults to 1):
Number of beams for beam search. 1 means no beam search.
num_return_sequences(:obj:`int`, `optional`, defaults to 1):
The number of independently computed returned sequences for each element in the batch. Note that this is not the value
we pass to the ``generator``'s :func:`~transformers.PreTrainedModel.generate` function, where we set ``num_return_sequences``
to ``num_beams``.
decoder_start_token_id (:obj:`int`, `optional`):
If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
Return:
:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`:
The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
shorter if all batches finished early due to the :obj:`eos_token_id`.
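Example (a minimal sketch; it assumes the pretrained ``facebook/rag-token-nq`` checkpoint, a working ``datasets``/``faiss`` installation and the dummy ``wiki_dpr`` index to keep the download small)::
>>> from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
>>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
>>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", return_tensors="pt")
>>> generated = model.generate(input_ids=input_dict["input_ids"], num_beams=4, max_length=20)
>>> generated_strings = tokenizer.batch_decode(generated, skip_special_tokens=True)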
"""
# set default parameters
max_length = max_length if max_length is not None else self.config.max_length
min_length = min_length if min_length is not None else self.config.min_length
early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
use_cache = use_cache if use_cache is not None else self.config.use_cache
num_beams = num_beams if num_beams is not None else self.config.num_beams
bos_token_id = bos_token_id if bos_token_id is not None else self.config.generator.bos_token_id
pad_token_id = pad_token_id if pad_token_id is not None else self.config.generator.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.config.generator.eos_token_id
length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
no_repeat_ngram_size = (
no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
)
bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
num_return_sequences = (
num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
)
decoder_start_token_id = (
decoder_start_token_id
if decoder_start_token_id is not None
else self.config.generator.decoder_start_token_id
)
# batch_size
batch_size = input_ids.shape[0]
# retrieve docs
if self.retriever is not None and context_input_ids is None:
question_hidden_states = self.question_encoder(input_ids)[0]
out = self.retriever(
input_ids,
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
prefix=self.generator.config.prefix,
n_docs=self.config.n_docs,
return_tensors="pt",
)
context_input_ids, context_attention_mask, retrieved_doc_embeds = (
out["context_input_ids"],
out["context_attention_mask"],
out["retrieved_doc_embeds"],
)
# set to correct device
retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states)
context_input_ids = context_input_ids.to(input_ids)
context_attention_mask = context_attention_mask.to(input_ids)
# compute doc_scores
doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_doc_embeds.transpose(1, 2)).squeeze(
1
)
encoder = self.rag.generator.get_encoder()
encoder_outputs = encoder(input_ids=context_input_ids, attention_mask=context_attention_mask, return_dict=True)
decoder_input_ids = torch.full(
(batch_size * num_beams, 1),
decoder_start_token_id,
dtype=torch.long,
device=next(self.parameters()).device,
)
last_hidden_state = encoder_outputs["last_hidden_state"]
def extend_enc_output(tensor, num_beams=None):
# split into `batch_size`, `num_beams`, `num_docs`
tensor = tensor[None, None, :].reshape((batch_size, 1, self.config.n_docs) + tensor.shape[1:])
# repeat same last hidden states over `num_beams` dimension
tensor = tensor.expand((batch_size, num_beams, self.config.n_docs) + tensor.shape[3:])
# merge `batch_size`, `num_beams`, `num_docs` dims again
return tensor.reshape((batch_size * num_beams * self.config.n_docs,) + tensor.shape[3:])
# correctly extend last_hidden_state and attention mask
context_attention_mask = extend_enc_output(context_attention_mask, num_beams=num_beams)
encoder_outputs["last_hidden_state"] = extend_enc_output(last_hidden_state, num_beams=num_beams)
doc_scores = doc_scores.repeat_interleave(num_beams, dim=0)
# define start_len & additional parameters
cur_len = 1
vocab_size = self.config.generator.vocab_size
kwargs["doc_scores"] = doc_scores
kwargs["encoder_outputs"] = encoder_outputs
# not needed. TODO(PVP): change after generate refactor
do_sample = False
temperature = self.config.temperature
top_k = self.config.top_k
top_p = self.config.top_p
repetition_penalty = self.config.repetition_penalty
if num_beams > 1:
return self._generate_beam_search(
decoder_input_ids,
cur_len=cur_len,
max_length=max_length,
min_length=min_length,
do_sample=do_sample,
early_stopping=early_stopping,
temperature=temperature,
top_k=top_k,
top_p=top_p,
repetition_penalty=repetition_penalty,
no_repeat_ngram_size=no_repeat_ngram_size,
bad_words_ids=bad_words_ids,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
batch_size=batch_size,
num_return_sequences=num_return_sequences,
length_penalty=length_penalty,
num_beams=num_beams,
vocab_size=vocab_size,
attention_mask=context_attention_mask,
use_cache=use_cache,
model_kwargs=kwargs,
)
else:
return self._generate_no_beam_search(
decoder_input_ids,
cur_len=cur_len,
max_length=max_length,
min_length=min_length,
do_sample=do_sample,
temperature=temperature,
top_k=top_k,
top_p=top_p,
repetition_penalty=repetition_penalty,
no_repeat_ngram_size=no_repeat_ngram_size,
bad_words_ids=bad_words_ids,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
batch_size=batch_size,
attention_mask=context_attention_mask,
use_cache=use_cache,
model_kwargs=kwargs,
)
def get_input_embeddings(self):
return self.rag.generator.get_input_embeddings()
def get_output_embeddings(self):
return self.rag.generator.get_output_embeddings()
def shift_tokens_right(self, input_ids, start_token_id=None):
"""Shift input ids one token to the right, and pad with start_token_id"""
if start_token_id is None:
start_token_id = self.config.decoder_start_token_id
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = start_token_id
return shifted_input_ids
def get_nll(self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0):
# shift tokens left
target = torch.cat(
[target[:, 1:], target.new(target.shape[0], 1).fill_(self.config.generator.pad_token_id)], 1
)
def _mask_pads(ll, smooth_obj):
pad_mask = target.eq(self.config.generator.pad_token_id)
if pad_mask.any():
ll.masked_fill_(pad_mask, 0.0)
smooth_obj.masked_fill_(pad_mask, 0.0)
return ll.squeeze(-1), smooth_obj.squeeze(-1)
rag_logprobs = self.marginalize(seq_logits, doc_scores)
target = target.unsqueeze(-1)
assert target.dim() == rag_logprobs.dim()
ll = rag_logprobs.gather(dim=-1, index=target)
smooth_obj = rag_logprobs.sum(dim=-1, keepdim=True) # total sum of all (normalised) logits
ll, smooth_obj = _mask_pads(ll, smooth_obj)
ll = ll.sum(1) # sum over tokens
smooth_obj = smooth_obj.sum(1)
nll_loss = -ll
smooth_loss = -smooth_obj
if reduce_loss:
nll_loss = nll_loss.sum()
smooth_loss = smooth_loss.sum()
eps_i = epsilon / rag_logprobs.size(-1)
loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
return loss
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAG Retriever model implementation."""
import os
import pickle
import time
from typing import Iterable, List, Optional, Tuple
import numpy as np
from .configuration_rag import RagConfig
from .file_utils import cached_path, is_datasets_available, is_faiss_available, is_remote_url
from .tokenization_rag import RagTokenizer
from .tokenization_utils_base import BatchEncoding
from .utils import logging
if is_datasets_available() and is_faiss_available():
from datasets import load_dataset
import faiss
logger = logging.get_logger(__name__)
LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/"
class Index:
"""
A base class for the Indices encapsulated by the :class:`~transformers.RagRetriever`.
"""
def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
"""
Returns a list of dictionaries, containing titles and text of the retrieved documents.
Args:
doc_ids (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`):
A tensor of document indices.
"""
raise NotImplementedError
def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
"""
For each query in the batch, retrieves ``n_docs`` documents.
Args:
question_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`):
An array of query vectors.
n_docs (:obj:`int`):
The number of docs retrieved per query.
Returns:
:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`: A tensor of indices of retrieved documents.
:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, vector_size)`: A tensor of vector representations of retrieved documents.
"""
raise NotImplementedError
def is_initialized(self):
"""
Returns :obj:`True` if index is already initialized.
"""
raise NotImplementedError
def init_index(self):
"""
A function responsible for loading the index into memory. Should be called only once per training run of a RAG model.
E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load the index.
"""
raise NotImplementedError
class LegacyIndex:
"""
An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR.
We use default faiss index parameters as specified in that repository.
Args:
vector_size (:obj:`int`):
The dimension of indexed vectors.
index_path (:obj:`str`):
A path to a `directory` containing index files compatible with
:class:`~transformers.retrieval_rag.LegacyIndex`
"""
INDEX_FILENAME = "hf_bert_base.hnswSQ8_correct_phi_128.c_index"
PASSAGE_FILENAME = "psgs_w100.tsv.pkl"
def __init__(self, vector_size, index_path):
self.index_id_to_db_id = []
self.index_path = index_path
self.passages = self._load_passages()
self.vector_size = vector_size
self.index = None
self._index_initialize = False
def _resolve_path(self, index_path, filename):
assert os.path.isdir(index_path) or is_remote_url(index_path), "Please specify a valid ``index_path``."
archive_file = os.path.join(index_path, filename)
try:
# Load from URL or cache if already cached
resolved_archive_file = cached_path(archive_file)
if resolved_archive_file is None:
raise EnvironmentError
except EnvironmentError:
msg = (
f"Can't load '{archive_file}'. Make sure that:\n\n"
f"- '{index_path}' is a correct remote path to a directory containing a file named {filename}"
f"- or '{index_path}' is the correct path to a directory containing a file named {filename}.\n\n"
)
raise EnvironmentError(msg)
if resolved_archive_file == archive_file:
logger.info("loading file {}".format(archive_file))
else:
logger.info("loading file {} from cache at {}".format(archive_file, resolved_archive_file))
return resolved_archive_file
def _load_passages(self):
logger.info("Loading passages from {}".format(self.index_path))
passages_path = self._resolve_path(self.index_path, self.PASSAGE_FILENAME)
with open(passages_path, "rb") as passages_file:
passages = pickle.load(passages_file)
return passages
def _deserialize_index(self):
logger.info("Loading index from {}".format(self.index_path))
resolved_index_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index.dpr")
self.index = faiss.read_index(resolved_index_path)
resolved_meta_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index_meta.dpr")
with open(resolved_meta_path, "rb") as metadata_file:
self.index_id_to_db_id = pickle.load(metadata_file)
assert (
len(self.index_id_to_db_id) == self.index.ntotal
), "Deserialized index_id_to_db_id should match faiss index size"
def is_initialized(self):
return self._index_initialize
def init_index(self):
index = faiss.IndexHNSWFlat(self.vector_size + 1, 512)
index.hnsw.efSearch = 128
index.hnsw.efConstruction = 200
self.index = index
self._deserialize_index()
self._index_initialize = True
def get_doc_dicts(self, doc_ids: np.array):
doc_list = []
for doc_ids_i in doc_ids:
ids = [str(int(doc_id)) for doc_id in doc_ids_i]
docs = [self.passages[doc_id] for doc_id in ids]
doc_list.append(docs)
doc_dicts = []
for docs in doc_list:
doc_dict = {}
doc_dict["title"] = [doc[1] for doc in docs]
doc_dict["text"] = [doc[0] for doc in docs]
doc_dicts.append(doc_dict)
return doc_dicts
def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
aux_dim = np.zeros(len(question_hidden_states), dtype="float32").reshape(-1, 1)
query_hnsw_vectors = np.hstack((question_hidden_states, aux_dim))
_, docs_ids = self.index.search(query_hnsw_vectors, n_docs)
vectors = [[self.index.reconstruct(int(doc_id))[:-1] for doc_id in doc_ids] for doc_ids in docs_ids]
ids = [[int(self.index_id_to_db_id[doc_id]) for doc_id in doc_ids] for doc_ids in docs_ids]
return np.array(ids), np.array(vectors)
class HFIndex:
"""
A wrapper around an instance of :class:`~datasets.Dataset`. If ``index_path`` is set to ``None``,
we load the pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from the indicated path on disk.
Args:
dataset (:obj:`str`, optional, defaults to ``wiki_dpr``):
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``):
Which split of the ``dataset`` to load.
index_name (:obj:`str`, optional):
The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be saved under this name.
index_path (:obj:`str`, optional, defaults to ``None``):
The path to the serialized faiss index on disk.
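Example (a minimal sketch; the values below mirror the ``wiki_dpr`` defaults used elsewhere in this file, with the dummy configuration to keep the download small; the ``exact`` index name is an assumption)::
>>> index = HFIndex("wiki_dpr", "train", index_name="exact", use_dummy_dataset=True)
>>> index.init_index()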
"""
def __init__(
self,
dataset_name: str,
dataset_split: str,
index_name: str,
index_path: Optional[str] = None,
use_dummy_dataset=False,
):
super().__init__()
self.dataset_name = dataset_name
self.dataset_split = dataset_split
self.index_name = index_name
self.index_path = index_path
self.use_dummy_dataset = use_dummy_dataset
self._index_initialize = False
logger.info("Loading passages from {}".format(self.dataset_name))
self.dataset = load_dataset(
self.dataset_name, with_index=False, split=self.dataset_split, dummy=self.use_dummy_dataset
)
def is_initialized(self):
return self._index_initialize
def init_index(self):
if self.index_path is not None:
logger.info("Loading index from {}".format(self.index_path))
self.dataset.load_faiss_index(index_name=self.index_name, file=self.index_path)
else:
logger.info("Loading index from {}".format(self.dataset_name + " with index name " + self.index_name))
self.dataset = load_dataset(
self.dataset_name,
with_embeddings=True,
with_index=True,
split=self.dataset_split,
index_name=self.index_name,
dummy=self.use_dummy_dataset,
)
self._index_initialize = True
def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
return [self.dataset[doc_ids[i].tolist()] for i in range(doc_ids.shape[0])]
def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
_, docs = self.dataset.get_nearest_examples_batch("embeddings", question_hidden_states, n_docs)
ids = [[int(i) for i in doc["id"]] for doc in docs]
vectors = [doc["embeddings"] for doc in docs]
return np.array(ids), np.array(vectors) # shapes (batch_size, n_docs) and (batch_size, n_docs, d)
class RagRetriever:
"""
Retriever used to get documents from vector queries.
It retrieves the document embeddings as well as the document contents, and formats them so they can be used with a RagModel.
Args:
config (:class:`~transformers.RagConfig`):
The configuration of the RAG model this Retriever is used with. Contains parameters indicating which ``Index`` to build.
question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer that was used to tokenize the question.
It is used to decode the question, which is then re-encoded together with the retrieved documents using the ``generator_tokenizer``.
generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer used for the generator part of the RagModel.
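Example (a minimal sketch; it assumes the ``facebook/rag-sequence-nq`` checkpoint and that ``datasets`` and ``faiss`` are installed)::
>>> from transformers import RagRetriever
>>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)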
"""
_init_retrieval = True
def __init__(self, config, question_encoder_tokenizer, generator_tokenizer):
super().__init__()
self.index = (
LegacyIndex(
config.retrieval_vector_size,
config.index_path or LEGACY_INDEX_PATH,
)
if config.index_name == "legacy"
else HFIndex(
config.dataset, config.dataset_split, config.index_name, config.index_path, config.use_dummy_dataset
)
)
self.generator_tokenizer = generator_tokenizer
self.question_encoder_tokenizer = question_encoder_tokenizer
self.n_docs = config.n_docs
self.batch_size = config.retrieval_batch_size
self.config = config
if self._init_retrieval:
self.init_retrieval()
@classmethod
def from_pretrained(cls, retriever_name_or_path, **kwargs):
config = RagConfig.from_pretrained(retriever_name_or_path, **kwargs)
rag_tokenizer = RagTokenizer.from_pretrained(retriever_name_or_path, config=config)
question_encoder_tokenizer = rag_tokenizer.question_encoder
generator_tokenizer = rag_tokenizer.generator
return cls(
config, question_encoder_tokenizer=question_encoder_tokenizer, generator_tokenizer=generator_tokenizer
)
def save_pretrained(self, save_directory):
self.config.save_pretrained(save_directory)
rag_tokenizer = RagTokenizer(
question_encoder_tokenizer=self.question_encoder_tokenizer,
generator_tokenizer=self.generator_tokenizer,
)
rag_tokenizer.save_pretrained(save_directory)
def init_retrieval(self):
"""
Retriever initialization function. It loads the index into memory.
"""
logger.info("initializing retrieval")
self.index.init_index()
def postprocess_docs(self, docs, input_strings, prefix, n_docs, return_tensors=None):
r"""
Postprocessing retrieved ``docs`` and combining them with ``input_strings``.
Args:
docs (:obj:`dict`):
Retrieved documents.
input_strings (:obj:`str`):
Input strings decoded from ``question_input_ids`` by the question encoder tokenizer.
prefix (:obj:`str`):
Prefix added at the beginning of each input, typically used with T5-based models.
Return:
:obj:`tuple(tensors)`:
a tuple consisting of two elements: contextualized ``input_ids`` and a compatible ``attention_mask``.
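As an illustration (assuming no ``prefix`` and the default ``title_sep`` / ``doc_sep`` separators), a retrieved document titled ``Paris`` with text ``Paris is the capital of France.`` combined with the input ``How many people live in Paris?`` yields a generator input string of roughly the form ``Paris / Paris is the capital of France. // How many people live in Paris?``.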
"""
def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
# TODO(Patrick): if we train more RAG models, I want to put the input first to take advantage of effortless truncation
# TODO(piktus): better handling of truncation
if doc_title.startswith('"'):
doc_title = doc_title[1:]
if doc_title.endswith('"'):
doc_title = doc_title[:-1]
if prefix is None:
prefix = ""
out = (prefix + doc_title + self.config.title_sep + doc_text + self.config.doc_sep + input_string).replace(
"  ", " "
)
return out
rag_input_strings = [
cat_input_and_doc(
docs[i]["title"][j],
docs[i]["text"][j],
input_strings[i],
prefix,
)
for i in range(len(docs))
for j in range(n_docs)
]
contextualized_inputs = self.generator_tokenizer.batch_encode_plus(
rag_input_strings,
max_length=self.config.max_combined_length,
return_tensors=return_tensors,
padding="max_length",
truncation=True,
)
return contextualized_inputs["input_ids"], contextualized_inputs["attention_mask"]
def _chunk_tensor(self, t: Iterable, chunk_size: int) -> List[Iterable]:
return [t[i : i + chunk_size] for i in range(0, len(t), chunk_size)]
def _main_retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, np.ndarray]:
question_hidden_states_batched = self._chunk_tensor(question_hidden_states, self.batch_size)
ids_batched = []
vectors_batched = []
for question_hidden_states in question_hidden_states_batched:
start_time = time.time()
ids, vectors = self.index.get_top_docs(question_hidden_states, n_docs)
logger.debug(
"index search time: {} sec, batch size {}".format(
time.time() - start_time, question_hidden_states.shape
)
)
ids_batched.extend(ids)
vectors_batched.extend(vectors)
return np.array(ids_batched), np.array(
vectors_batched
) # shapes (batch_size, n_docs) and (batch_size, n_docs, d)
def retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, List[dict]]:
"""
Retrieves documents for specified ``question_hidden_states``.
Args:
question_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`):
A batch of query vectors to retrieve with.
n_docs (:obj:`int`):
The number of docs retrieved per query.
Return:
retrieved_doc_embeds (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`):
The retrieval embeddings of the retrieved docs per query.
doc_ids (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`):
The ids of the documents in the index.
doc_dicts (:obj:`List[dict]`):
The retrieved document dictionaries (titles and texts) per query.
"""
doc_ids, retrieved_doc_embeds = self._main_retrieve(question_hidden_states, n_docs)
return retrieved_doc_embeds, doc_ids, self.index.get_doc_dicts(doc_ids)
def __call__(
self,
question_input_ids: List[List[int]],
question_hidden_states: np.ndarray,
prefix=None,
n_docs=None,
return_tensors=None,
) -> BatchEncoding:
"""
Retrieves documents for specified :obj:`question_hidden_states`.
Args:
question_input_ids: (:obj:`List[List[int]]`) batch of input ids
question_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`):
A batch of query vectors to retrieve with.
prefix: (:obj:`str`, `optional`):
The prefix used by the generator's tokenizer.
n_docs (:obj:`int`, `optional`):
The number of docs retrieved per query.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
Output:
:class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
- **context_input_ids** -- List of token ids to be fed to a model.
`What are input IDs? <../glossary.html#input-ids>`__
- **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
:obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
`What are attention masks? <../glossary.html#attention-mask>`__
- **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
- **doc_ids** -- List of ids of the retrieved documents
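Example (a minimal sketch; ``retriever`` is assumed to be an initialized :class:`~transformers.RagRetriever` and ``question_input_ids`` / ``question_hidden_states`` are assumed to come from a DPR question encoder, as in the RAG model docstrings)::
>>> out = retriever(question_input_ids, question_hidden_states, prefix=None, n_docs=5, return_tensors="pt")
>>> context_input_ids, doc_ids = out["context_input_ids"], out["doc_ids"]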
"""
n_docs = n_docs if n_docs is not None else self.n_docs
prefix = prefix if prefix is not None else self.config.generator.prefix
retrieved_doc_embeds, doc_ids, docs = self.retrieve(question_hidden_states, n_docs)
input_strings = self.question_encoder_tokenizer.batch_decode(question_input_ids, skip_special_tokens=True)
context_input_ids, context_attention_mask = self.postprocess_docs(
docs, input_strings, prefix, n_docs, return_tensors=return_tensors
)
return BatchEncoding(
{
"context_input_ids": context_input_ids,
"context_attention_mask": context_attention_mask,
"retrieved_doc_embeds": retrieved_doc_embeds,
"doc_ids": doc_ids,
},
tensor_type=return_tensors,
)
@@ -10,7 +10,7 @@ from distutils.util import strtobool
from io import StringIO
from pathlib import Path
from .file_utils import _tf_available, _torch_available, _torch_tpu_available
from .file_utils import _datasets_available, _faiss_available, _tf_available, _torch_available, _torch_tpu_available
SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
@@ -161,6 +161,21 @@ def require_torch_and_cuda(test_case):
return test_case
def require_datasets(test_case):
"""Decorator marking a test that requires datasets."""
if not _datasets_available:
test_case = unittest.skip("test requires Datasets")(test_case)
return test_case
def require_faiss(test_case):
"""Decorator marking a test that requires faiss."""
if not _faiss_available:
test_case = unittest.skip("test requires Faiss")(test_case)
return test_case
def get_tests_dir():
"""
returns the full path to the `tests` dir, so that the tests can be invoked from anywhere
......
@@ -26,6 +26,7 @@ from .configuration_auto import (
CamembertConfig,
CTRLConfig,
DistilBertConfig,
DPRConfig,
ElectraConfig,
EncoderDecoderConfig,
FlaubertConfig,
@@ -40,6 +41,7 @@ from .configuration_auto import (
MobileBertConfig,
OpenAIGPTConfig,
PegasusConfig,
RagConfig,
ReformerConfig,
RetriBertConfig,
RobertaConfig,
@@ -60,6 +62,7 @@ from .tokenization_bertweet import BertweetTokenizer
from .tokenization_camembert import CamembertTokenizer
from .tokenization_ctrl import CTRLTokenizer
from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
from .tokenization_dpr import DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast
from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
from .tokenization_flaubert import FlaubertTokenizer
from .tokenization_fsmt import FSMTTokenizer
@@ -74,6 +77,7 @@ from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFas
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .tokenization_pegasus import PegasusTokenizer
from .tokenization_phobert import PhobertTokenizer
from .tokenization_rag import RagTokenizer
from .tokenization_reformer import ReformerTokenizer
from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
@@ -110,6 +114,7 @@ TOKENIZER_MAPPING = OrderedDict(
(FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)),
(LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)),
(LayoutLMConfig, (LayoutLMTokenizer, LayoutLMTokenizerFast)),
(DPRConfig, (DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast)),
(BertConfig, (BertTokenizer, BertTokenizerFast)),
(OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
(GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)),
@@ -121,6 +126,7 @@ TOKENIZER_MAPPING = OrderedDict(
(FSMTConfig, (FSMTTokenizer, None)),
(BertGenerationConfig, (BertGenerationTokenizer, None)),
(LayoutLMConfig, (LayoutLMTokenizer, None)),
(RagConfig, (RagTokenizer, None)),
]
)
......
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RAG."""
import os
from typing import List, Optional
from .configuration_rag import RagConfig
from .tokenization_utils_base import BatchEncoding
from .utils import logging
logger = logging.get_logger(__name__)
class RagTokenizer:
def __init__(self, question_encoder, generator):
self.question_encoder = question_encoder
self.generator = generator
def save_pretrained(self, save_directory):
if os.path.isfile(save_directory):
raise ValueError("Provided path ({}) should be a directory, not a file".format(save_directory))
os.makedirs(save_directory, exist_ok=True)
question_encoder_path = os.path.join(save_directory, "question_encoder_tokenizer")
generator_path = os.path.join(save_directory, "generator_tokenizer")
self.question_encoder.save_pretrained(question_encoder_path)
self.generator.save_pretrained(generator_path)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
# dynamically import AutoTokenizer
from .tokenization_auto import AutoTokenizer
config = kwargs.pop("config", None)
if config is None:
config = RagConfig.from_pretrained(pretrained_model_name_or_path)
question_encoder_path = os.path.join(pretrained_model_name_or_path, "question_encoder_tokenizer")
generator_path = os.path.join(pretrained_model_name_or_path, "generator_tokenizer")
question_encoder = AutoTokenizer.from_pretrained(question_encoder_path, config=config.question_encoder)
generator = AutoTokenizer.from_pretrained(generator_path, config=config.generator)
return cls(question_encoder=question_encoder, generator=generator)
def __call__(self, *args, **kwargs):
return self.question_encoder(*args, **kwargs)
def batch_decode(self, *args, **kwargs):
return self.generator.batch_decode(*args, **kwargs)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = "np",
truncation=True,
**kwargs,
) -> BatchEncoding:
r"""
Prepare a batch that can be passed directly to an instance of :class:`~transformers.RagModel`.
Args:
src_texts: (:obj:`List[str]`):
List of documents to summarize or source language texts.
tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts.
max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts).
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries).
If left unset or set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`'longest'`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "np"):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`):
Activates and controls truncation. Accepts the following values:
* :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
provided. This will truncate token by token, removing a token from the longest sequence in the pair
if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`False` or :obj:`'do_not_truncate'`: No truncation (i.e., can output batch with
sequence lengths greater than the model maximum admissible input size).
**kwargs:
Additional keyword arguments passed along to :obj:`self.__call__`.
Returns:
:class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
- **input_ids** -- List of token ids to be fed to the encoder.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts
The full set of keys ``[input_ids, attention_mask, labels]`` will only be returned if ``tgt_texts`` is passed.
Otherwise, ``input_ids`` and ``attention_mask`` will be the only keys.
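Example (a minimal sketch; it assumes the ``facebook/rag-sequence-nq`` tokenizer files are available)::
>>> from transformers import RagTokenizer
>>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
>>> batch = tokenizer.prepare_seq2seq_batch(["How many people live in Paris?"], tgt_texts=["In Paris, there are 10 million people."], return_tensors="pt")
>>> input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]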
"""
if max_length is None:
max_length = self.question_encoder.model_max_length
model_inputs: BatchEncoding = self.question_encoder(
src_texts,
add_special_tokens=True,
return_tensors=return_tensors,
max_length=max_length,
padding=padding,
truncation=truncation,
**kwargs,
)
if tgt_texts is None:
return model_inputs
# Process tgt_texts
if max_target_length is None:
max_target_length = self.generator.model_max_length
labels = self.generator(
tgt_texts,
add_special_tokens=True,
return_tensors=return_tensors,
padding=padding,
max_length=max_target_length,
truncation=truncation,
**kwargs,
)["input_ids"]
model_inputs["labels"] = labels
return model_inputs
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from unittest.mock import patch
import numpy as np
from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from transformers.tokenization_bart import BartTokenizer
from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
from transformers.tokenization_t5 import T5Tokenizer
from .test_modeling_bart import ModelTester as BartModelTester
from .test_modeling_dpr import DPRModelTester
from .test_modeling_t5 import T5ModelTester
TOLERANCE = 1e-3
T5_SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
if is_torch_available() and is_datasets_available() and is_faiss_available():
import torch
from datasets import Dataset
import faiss
from transformers import (
AutoConfig,
AutoModel,
AutoModelForSeq2SeqLM,
RagConfig,
RagModel,
RagRetriever,
RagSequenceForGeneration,
RagTokenForGeneration,
)
from transformers.modeling_outputs import BaseModelOutput
def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
"""If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""
if a is None and b is None:
return True
try:
if torch.allclose(a, b, atol=atol):
return True
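# no active exception at this point, so the bare raise triggers a RuntimeError that the
# except clause below converts into a readable AssertionError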
raise
except Exception:
msg = "{} != {}".format(a, b)
if prefix:
msg = prefix + ": " + msg
raise AssertionError(msg)
def require_retrieval(test_case):
"""
Decorator marking a test that requires the set of dependencies necessary to perform retrieval with
:class:`~transformers.RagRetriever`.
These tests are skipped when respective libraries are not installed.
"""
if not (is_torch_available() and is_datasets_available() and is_faiss_available()):
test_case = unittest.skip("test requires PyTorch, datasets and faiss")(test_case)
return test_case
@require_torch
@require_retrieval
class RagTestMixin:
all_model_classes = (
(RagModel, RagTokenForGeneration, RagSequenceForGeneration)
if is_torch_available() and is_datasets_available() and is_faiss_available()
else ()
)
retrieval_vector_size = 32
n_docs = 2
max_combined_length = 16
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
# DPR tok
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"want",
"##want",
"##ed",
"wa",
"un",
"runn",
"##ing",
",",
"low",
"lowest",
]
dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer")
os.makedirs(dpr_tokenizer_path, exist_ok=True)
self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
# BART tok
vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"\u0120",
"\u0120l",
"\u0120n",
"\u0120lo",
"\u0120low",
"er",
"\u0120lowest",
"\u0120newer",
"\u0120wider",
"<unk>",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer")
os.makedirs(bart_tokenizer_path, exist_ok=True)
self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
t5_tokenizer = T5Tokenizer(T5_SAMPLE_VOCAB)
t5_tokenizer_path = os.path.join(self.tmpdirname, "t5_tokenizer")
t5_tokenizer.save_pretrained(t5_tokenizer_path)
@cached_property
def dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer:
return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer"))
@cached_property
def bart_tokenizer(self) -> BartTokenizer:
return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer"))
@cached_property
def t5_tokenizer(self) -> T5Tokenizer:
return T5Tokenizer.from_pretrained(os.path.join(self.tmpdirname, "t5_tokenizer"))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def get_retriever(self, config):
dataset = Dataset.from_dict(
{
"id": ["0", "1"],
"text": ["foo", "bar"],
"title": ["Foo", "Bar"],
"embeddings": [np.ones(self.retrieval_vector_size), 2 * np.ones(self.retrieval_vector_size)],
}
)
dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)
tokenizer = self.bart_tokenizer if config.generator.model_type == "bart" else self.t5_tokenizer
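# patch load_dataset so the retriever builds its index from the small in-memory dataset above instead of downloading a real one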
with patch("transformers.retrieval_rag.load_dataset") as mock_load_dataset:
mock_load_dataset.return_value = dataset
retriever = RagRetriever(
config,
question_encoder_tokenizer=self.dpr_tokenizer,
generator_tokenizer=tokenizer,
)
return retriever
def check_model_with_retriever(
self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
):
self.assertIsNotNone(config.question_encoder)
self.assertIsNotNone(config.generator)
for model_class in self.all_model_classes:
model = model_class(config, retriever=self.get_retriever(config)).to(torch_device)
model.eval()
self.assertTrue(model.config.is_encoder_decoder)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
)
# logits
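# each example is expanded into n_docs (question, document) pairs before being fed to the generator,
# hence the n_docs * batch_size leading dimension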
self.assertEqual(
outputs.logits.shape,
(self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
)
# generator encoder last hidden states
self.assertEqual(
outputs.generator_enc_last_hidden_state.shape,
(self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
)
# doc scores
self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs))
def check_model_generate(
self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
):
self.assertIsNotNone(config.question_encoder)
self.assertIsNotNone(config.generator)
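# RagModel itself has no generation head, so only the *ForGeneration classes are exercised here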
for model_class in self.all_model_classes[1:]:
model = model_class(config, retriever=self.get_retriever(config)).to(torch_device)
model.eval()
self.assertTrue(model.config.is_encoder_decoder)
outputs = model.generate(
input_ids=input_ids,
num_beams=2,
num_return_sequences=2,
decoder_start_token_id=config.generator.eos_token_id,
)
self.assertIsNotNone(outputs)
def check_model_without_retriever(
self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
):
self.assertIsNotNone(config.question_encoder)
self.assertIsNotNone(config.generator)
retriever = self.get_retriever(config)
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
model.eval()
self.assertTrue(model.config.is_encoder_decoder)
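# no retriever is attached to the model here, so retrieval is done by hand:
# encode the question, call the retriever, then pass the contexts and doc scores explicitly to forward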
question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0]
out = retriever(
input_ids,
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
prefix=config.generator.prefix,
return_tensors="pt",
)
context_input_ids, context_attention_mask, retrieved_doc_embeds = (
out["context_input_ids"],
out["context_attention_mask"],
out["retrieved_doc_embeds"],
)
# cast
retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states)
context_input_ids = context_input_ids.to(input_ids)
context_attention_mask = context_attention_mask.to(input_ids)
# compute doc_scores
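# bmm of (batch, 1, dim) question states with (batch, dim, n_docs) doc embeddings -> (batch, 1, n_docs), squeezed to (batch, n_docs)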
doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_doc_embeds.transpose(1, 2)).squeeze(
1
)
outputs = model(
context_input_ids=context_input_ids,
context_attention_mask=context_attention_mask,
doc_scores=doc_scores,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
)
# logits
self.assertEqual(
outputs.logits.shape,
(self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
)
# generator encoder last hidden states
self.assertEqual(
outputs.generator_enc_last_hidden_state.shape,
(self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
)
# doc scores
self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs))
def check_model_with_encoder_outputs(
self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs
):
self.assertIsNotNone(config.question_encoder)
self.assertIsNotNone(config.generator)
for model_class in self.all_model_classes:
model = model_class(config, retriever=self.get_retriever(config)).to(torch_device)
model.eval()
self.assertTrue(model.config.is_encoder_decoder)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
)
encoder_outputs = BaseModelOutput(outputs.generator_enc_last_hidden_state)
# run only generator
outputs = model(
encoder_outputs=encoder_outputs,
doc_scores=outputs.doc_scores,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
)
# logits
self.assertEqual(
outputs.logits.shape,
(self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size),
)
# generator encoder last hidden states
self.assertEqual(
outputs.generator_enc_last_hidden_state.shape,
(self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size),
)
# doc scores
self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs))
def test_model_with_retriever(self):
inputs_dict = self.config_and_inputs
self.check_model_with_retriever(**inputs_dict)
def test_model_without_retriever(self):
inputs_dict = self.config_and_inputs
self.check_model_without_retriever(**inputs_dict)
def test_model_with_encoder_outputs(self):
inputs_dict = self.config_and_inputs
self.check_model_with_encoder_outputs(**inputs_dict)
def test_model_generate(self):
inputs_dict = self.config_and_inputs
self.check_model_generate(**inputs_dict)
@require_torch
@require_retrieval
class RagDPRBartTest(RagTestMixin, unittest.TestCase):
@cached_property
def config_and_inputs(self):
question_encoder_tester = DPRModelTester(self)
dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs()
generator_tester = BartModelTester(self)
bart_config_and_inputs = generator_tester.prepare_config_and_inputs_for_common()
(question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs
(generator_config, bart_inputs_dict) = bart_config_and_inputs
decoder_input_ids, decoder_attention_mask = bart_inputs_dict["input_ids"], bart_inputs_dict["attention_mask"]
config = RagConfig.from_question_encoder_generator_configs(
question_encoder_config,
generator_config,
n_docs=self.n_docs,
retrieval_vector_size=self.retrieval_vector_size,
max_combined_length=self.max_combined_length,
use_cache=False,
)
return {
"config": config,
"input_ids": input_ids,
"attention_mask": input_mask,
"decoder_input_ids": decoder_input_ids,
"decoder_attention_mask": decoder_attention_mask,
}
@require_torch
@require_retrieval
class RagDPRT5Test(RagTestMixin, unittest.TestCase):
@cached_property
def config_and_inputs(self):
question_encoder_tester = DPRModelTester(self)
dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs()
generator_tester = T5ModelTester(self, vocab_size=1100, n_positions=30)
t5_config_and_inputs = generator_tester.prepare_config_and_inputs()
(question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs
(generator_config, _, decoder_input_ids, _, decoder_attention_mask, _) = t5_config_and_inputs
config = RagConfig.from_question_encoder_generator_configs(
question_encoder_config,
generator_config,
n_docs=self.n_docs,
retrieval_vector_size=self.retrieval_vector_size,
max_combined_length=self.max_combined_length,
use_cache=False,
)
return {
"config": config,
"input_ids": input_ids,
"attention_mask": input_mask,
"decoder_input_ids": decoder_input_ids,
"decoder_attention_mask": decoder_attention_mask,
}
@require_torch
@require_retrieval
class RagModelIntegrationTests(unittest.TestCase):
@cached_property
def sequence_model(self):
return (
RagSequenceForGeneration.from_pretrained_question_encoder_generator(
"facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn"
)
.to(torch_device)
.eval()
)
@cached_property
def token_model(self):
return (
RagTokenForGeneration.from_pretrained_question_encoder_generator(
"facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn"
)
.to(torch_device)
.eval()
)
def get_rag_config(self):
question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn")
return RagConfig.from_question_encoder_generator_configs(
question_encoder_config,
generator_config,
bos_token_id=0,
decoder_start_token_id=2,
eos_token_id=2,
is_encoder_decoder=True,
pad_token_id=1,
vocab_size=50264,
title_sep=" / ",
doc_sep=" // ",
n_docs=5,
max_combined_length=300,
dataset="wiki_dpr",
dataset_split="train",
index_name="exact",
index_path=None,
use_dummy_dataset=True,
retrieval_vector_size=768,
retrieval_batch_size=8,
)
@slow
def test_rag_sequence_inference(self):
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
rag_sequence = self.sequence_model
rag_sequence.set_retriever(rag_retriever)
input_ids = rag_question_encoder_tokenizer(
"who sings does he love me with reba", return_tensors="pt"
).input_ids
decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids
input_ids = input_ids.to(torch_device)
decoder_input_ids = decoder_input_ids.to(torch_device)
with torch.no_grad():
output = rag_sequence(
input_ids,
labels=decoder_input_ids,
)
expected_shape = torch.Size([5, 5, 50264])
self.assertEqual(output.logits.shape, expected_shape)
expected_doc_scores = torch.tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]).to(torch_device)
_assert_tensors_equal(expected_doc_scores, output.doc_scores, atol=TOLERANCE)
expected_loss = torch.tensor([38.7446]).to(torch_device)
_assert_tensors_equal(expected_loss, output.loss, atol=TOLERANCE)
@slow
def test_rag_token_inference(self):
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
rag_token = self.token_model
rag_token.set_retriever(rag_retriever)
input_ids = rag_question_encoder_tokenizer(
"who sings does he love me with reba", return_tensors="pt"
).input_ids
decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids
input_ids = input_ids.to(torch_device)
decoder_input_ids = decoder_input_ids.to(torch_device)
with torch.no_grad():
output = rag_token(
input_ids,
labels=decoder_input_ids,
)
expected_shape = torch.Size([5, 5, 50264])
self.assertEqual(output.logits.shape, expected_shape)
expected_doc_scores = torch.tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]).to(torch_device)
_assert_tensors_equal(expected_doc_scores, output.doc_scores, atol=TOLERANCE)
expected_loss = torch.tensor([38.7045]).to(torch_device)
_assert_tensors_equal(expected_loss, output.loss, atol=TOLERANCE)
@slow
def test_rag_token_generate_beam(self):
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
rag_token = self.token_model
rag_token.set_retriever(rag_retriever)
input_ids = rag_question_encoder_tokenizer(
"who sings does he love me with reba", return_tensors="pt"
).input_ids
input_ids = input_ids.to(torch_device)
output_ids = rag_token.generate(
input_ids,
decoder_start_token_id=rag_token.generator.config.decoder_start_token_id,
num_beams=2,
num_return_sequences=2,
)
# sequence generate test
output_text_1 = rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True)
# Expected outputs as given by model at integration time.
EXPECTED_OUTPUT_TEXT_1 = "The songwriting credits are credited to ABBA"
EXPECTED_OUTPUT_TEXT_2 = 'The songwriting credits are credited to "B'
self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1)
self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2)
@slow
def test_rag_token_generate_batch(self):
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
rag_token = self.token_model
rag_token.set_retriever(rag_retriever)
questions = [
"who sings does he love me with reba",
"how many pages is invisible man by ralph ellison",
]
input_ids = rag_question_encoder_tokenizer.batch_encode_plus(
questions,
return_tensors="pt",
padding=True,
truncation=True,
).input_ids
input_ids = input_ids.to(torch_device)
output_ids = rag_token.generate(
input_ids,
decoder_start_token_id=rag_token.generator.config.decoder_start_token_id,
num_beams=4,
num_return_sequences=1,
max_length=10,
)
# sequence generate test
output_text_1 = rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True)
# Expected outputs as given by model at integration time.
EXPECTED_OUTPUT_TEXT_1 = '"People Need Love" is the'
EXPECTED_OUTPUT_TEXT_2 = '"How many pages is invisible man'
self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1)
self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2)
@slow
def test_rag_sequence_generate_batch(self):
# IMPORTANT: This test fails on GPU, but passes on CPU -> beam search is very sensitive
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
rag_sequence = self.sequence_model
rag_sequence.set_retriever(rag_retriever)
questions = [
"who sings does he love me with reba",
"how many pages is invisible man by ralph ellison",
]
input_ids = rag_question_encoder_tokenizer.batch_encode_plus(
questions,
return_tensors="pt",
padding=True,
truncation=True,
).input_ids
input_ids = input_ids.to(torch_device)
output_ids = rag_sequence.generate(
input_ids,
decoder_start_token_id=rag_sequence.generator.config.decoder_start_token_id,
num_beams=4,
num_return_sequences=1,
max_length=10,
)
# sequence generate test
output_text_1 = rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True)
# Expected outputs as given by model at integration time.
EXPECTED_OUTPUT_TEXT_1 = '"I Know Him So Well"'
EXPECTED_OUTPUT_TEXT_2 = '"Howl" chronicles the'
self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1)
self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2)
@slow
def test_rag_sequence_generate_beam(self):
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
rag_sequence = self.sequence_model
rag_sequence.set_retriever(rag_retriever)
input_ids = rag_question_encoder_tokenizer(
"who sings does he love me with reba", return_tensors="pt"
).input_ids
input_ids = input_ids.to(torch_device)
output_ids = rag_sequence.generate(
input_ids,
decoder_start_token_id=rag_sequence.generator.config.decoder_start_token_id,
num_beams=2,
num_return_sequences=2,
)
# sequence generate test
output_text_1 = rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True)
# Expected outputs as given by model at integration time.
EXPECTED_OUTPUT_TEXT_1 = """ ABBA / small label like Playboy Records did not have the distribution resources to meet the demand for the single from retailers and radio programmers. The foursome decided to record their first album together in late 1972, and sessions began on 26 September 1972. The women shared lead vocals on "Nina, Pretty Ballerina" that day."""
EXPECTED_OUTPUT_TEXT_2 = """ ABBA / small label like Playboy Records did not have the distribution resources to meet the demand for the single from retailers and radio programmers. The foursome decided to record their first album together in late 1972, and sessions began on 26 September 1972. The women shared lead vocals on "Nina, Pretty Ballerina" (a top ten hit in Austria)"""
self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1)
self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2)
@require_torch
@require_retrieval
class RagModelSaveLoadTests(unittest.TestCase):
def get_rag_config(self):
question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn")
return RagConfig.from_question_encoder_generator_configs(
question_encoder_config,
generator_config,
bos_token_id=0,
decoder_start_token_id=2,
eos_token_id=2,
is_encoder_decoder=True,
pad_token_id=1,
vocab_size=50264,
title_sep=" / ",
doc_sep=" // ",
n_docs=5,
max_combined_length=300,
dataset="wiki_dpr",
dataset_split="train",
index_name="exact",
index_path=None,
use_dummy_dataset=True,
retrieval_vector_size=768,
retrieval_batch_size=8,
)
@slow
def test_rag_sequence_from_pretrained(self):
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
input_ids = rag_question_encoder_tokenizer(
"who sings does he love me with reba", return_tensors="pt"
).input_ids
decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids
input_ids = input_ids.to(torch_device)
decoder_input_ids = decoder_input_ids.to(torch_device)
with tempfile.TemporaryDirectory() as tmp_dirname:
rag_sequence = RagSequenceForGeneration.from_pretrained_question_encoder_generator(
"facebook/dpr-question_encoder-single-nq-base",
"facebook/bart-large-cnn",
retriever=rag_retriever,
config=rag_config,
).to(torch_device)
# check that the from pretrained methods work
rag_sequence.save_pretrained(tmp_dirname)
rag_sequence = RagSequenceForGeneration.from_pretrained(tmp_dirname, retriever=rag_retriever)
rag_sequence.to(torch_device)
with torch.no_grad():
output = rag_sequence(
input_ids,
labels=decoder_input_ids,
)
loss_pretrained = output.loss
del rag_sequence
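# rebuild the same model from independently loaded sub-models; its loss should match the saved/reloaded model above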
question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
rag_sequence = RagSequenceForGeneration(
config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever
)
rag_sequence.to(torch_device)
with torch.no_grad():
output = rag_sequence(
input_ids,
labels=decoder_input_ids,
)
loss_init = output.loss
self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4)
@slow
def test_rag_token_from_pretrained(self):
rag_config = self.get_rag_config()
rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
"facebook/dpr-question_encoder-single-nq-base"
)
rag_retriever = RagRetriever(
rag_config,
question_encoder_tokenizer=rag_question_encoder_tokenizer,
generator_tokenizer=rag_decoder_tokenizer,
)
input_ids = rag_question_encoder_tokenizer(
"who sings does he love me with reba", return_tensors="pt"
).input_ids
decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids
input_ids = input_ids.to(torch_device)
decoder_input_ids = decoder_input_ids.to(torch_device)
with tempfile.TemporaryDirectory() as tmp_dirname:
rag_token = RagTokenForGeneration.from_pretrained_question_encoder_generator(
"facebook/dpr-question_encoder-single-nq-base",
"facebook/bart-large-cnn",
retriever=rag_retriever,
config=rag_config,
).to(torch_device)
# check that the from pretrained methods work
rag_token.save_pretrained(tmp_dirname)
rag_token = RagTokenForGeneration.from_pretrained(tmp_dirname, retriever=rag_retriever)
rag_token.to(torch_device)
with torch.no_grad():
output = rag_token(
input_ids,
labels=decoder_input_ids,
)
loss_pretrained = output.loss
del rag_token
question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
rag_token = RagTokenForGeneration(
config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever
)
rag_token.to(torch_device)
with torch.no_grad():
output = rag_token(
input_ids,
labels=decoder_input_ids,
)
loss_init = output.loss
self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4)
if is_torch_available():
class T5ModelTester:
def __init__(
self,
parent,
vocab_size=99,
n_positions=14,
batch_size=13,
encoder_seq_length=7,
decoder_seq_length=9,
# For common tests
seq_length=7,
is_training=True,
use_attention_mask=True,
use_labels=True,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
d_ff=37,
relative_attention_num_buckets=8,
dropout_rate=0.1,
initializer_factor=0.002,
eos_token_id=1,
pad_token_id=0,
decoder_start_token_id=0,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.encoder_seq_length = encoder_seq_length
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_positions = n_positions
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.d_ff = d_ff
self.relative_attention_num_buckets = relative_attention_num_buckets
self.dropout_rate = dropout_rate
self.initializer_factor = initializer_factor
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.decoder_start_token_id = decoder_start_token_id
self.scope = None
def prepare_config_and_inputs(self):
......
import json
import os
import pickle
import shutil
import tempfile
from unittest import TestCase
from unittest.mock import patch
import numpy as np
from datasets import Dataset
import faiss
from transformers.configuration_bart import BartConfig
from transformers.configuration_dpr import DPRConfig
from transformers.configuration_rag import RagConfig
from transformers.retrieval_rag import RagRetriever
from transformers.testing_utils import require_datasets, require_faiss, require_torch
from transformers.tokenization_bart import BartTokenizer
from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
@require_faiss
@require_datasets
class RagRetrieverTest(TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
self.retrieval_vector_size = 8
# DPR tok
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"want",
"##want",
"##ed",
"wa",
"un",
"runn",
"##ing",
",",
"low",
"lowest",
]
dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer")
os.makedirs(dpr_tokenizer_path, exist_ok=True)
self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
# BART tok
vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"\u0120",
"\u0120l",
"\u0120n",
"\u0120lo",
"\u0120low",
"er",
"\u0120lowest",
"\u0120newer",
"\u0120wider",
"<unk>",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer")
os.makedirs(bart_tokenizer_path, exist_ok=True)
self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer:
return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer"))
def get_bart_tokenizer(self) -> BartTokenizer:
return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer"))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def get_dummy_hf_index_retriever(self):
dataset = Dataset.from_dict(
{
"id": ["0", "1"],
"text": ["foo", "bar"],
"title": ["Foo", "Bar"],
"embeddings": [np.ones(self.retrieval_vector_size), 2 * np.ones(self.retrieval_vector_size)],
}
)
dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)
config = RagConfig(
retrieval_vector_size=self.retrieval_vector_size,
question_encoder=DPRConfig().to_dict(),
generator=BartConfig().to_dict(),
)
with patch("transformers.retrieval_rag.load_dataset") as mock_load_dataset:
mock_load_dataset.return_value = dataset
retriever = RagRetriever(
config,
question_encoder_tokenizer=self.get_dpr_tokenizer(),
generator_tokenizer=self.get_bart_tokenizer(),
)
return retriever
def get_dummy_legacy_index_retriever(self):
dataset = Dataset.from_dict(
{
"id": ["0", "1"],
"text": ["foo", "bar"],
"title": ["Foo", "Bar"],
"embeddings": [np.ones(self.retrieval_vector_size + 1), 2 * np.ones(self.retrieval_vector_size + 1)],
}
)
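# legacy-index embeddings carry one extra auxiliary dimension (hence retrieval_vector_size + 1);
# the retrieve() test below checks that the retriever strips it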
dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)
index_file_name = os.path.join(self.tmpdirname, "hf_bert_base.hnswSQ8_correct_phi_128.c_index")
dataset.save_faiss_index("embeddings", index_file_name + ".index.dpr")
pickle.dump(dataset["id"], open(index_file_name + ".index_meta.dpr", "wb"))
passages_file_name = os.path.join(self.tmpdirname, "psgs_w100.tsv.pkl")
passages = {sample["id"]: [sample["text"], sample["title"]] for sample in dataset}
pickle.dump(passages, open(passages_file_name, "wb"))
config = RagConfig(
retrieval_vector_size=self.retrieval_vector_size,
question_encoder=DPRConfig().to_dict(),
generator=BartConfig().to_dict(),
index_name="legacy",
index_path=self.tmpdirname,
passages_path=self.tmpdirname,
)
retriever = RagRetriever(
config, question_encoder_tokenizer=self.get_dpr_tokenizer(), generator_tokenizer=self.get_bart_tokenizer()
)
return retriever
def test_hf_index_retriever_retrieve(self):
n_docs = 1
retriever = self.get_dummy_hf_index_retriever()
hidden_states = np.array(
[np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32
)
retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs)
self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size))
self.assertEqual(len(doc_dicts), 2)
self.assertEqual(sorted(doc_dicts[0]), ["embeddings", "id", "text", "title"])
self.assertEqual(len(doc_dicts[0]["id"]), n_docs)
self.assertEqual(doc_dicts[0]["id"][0], "1") # max inner product is reached with second doc
self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc
self.assertListEqual(list(doc_ids), [1, 0])
def test_legacy_index_retriever_retrieve(self):
n_docs = 1
retriever = self.get_dummy_legacy_index_retriever()
hidden_states = np.array(
[np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32
)
retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs)
self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size))
self.assertEqual(len(doc_dicts), 2)
self.assertEqual(sorted(doc_dicts[0]), ["text", "title"])
self.assertEqual(len(doc_dicts[0]["text"]), n_docs)
self.assertEqual(doc_dicts[0]["text"][0], "bar") # max inner product is reached with second doc
self.assertEqual(doc_dicts[1]["text"][0], "foo") # max inner product is reached with first doc
self.assertListEqual(list(doc_ids), [1, 0])
@require_torch
def test_hf_index_retriever_call(self):
import torch
n_docs = 1
retriever = self.get_dummy_hf_index_retriever()
question_input_ids = [[5, 7], [10, 11]]
hidden_states = np.array(
[np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32
)
out = retriever(question_input_ids, hidden_states, prefix=retriever.config.generator.prefix, n_docs=n_docs)
context_input_ids, context_attention_mask, retrieved_doc_embeds = (
out["context_input_ids"],
out["context_attention_mask"],
out["retrieved_doc_embeds"],
)
self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size))
self.assertIsInstance(context_input_ids, list)
self.assertIsInstance(context_attention_mask, list)
self.assertIsInstance(retrieved_doc_embeds, np.ndarray)
out = retriever(
question_input_ids,
hidden_states,
prefix=retriever.config.generator.prefix,
n_docs=n_docs,
return_tensors="pt",
)
context_input_ids, context_attention_mask, retrieved_doc_embeds, doc_ids = ( # noqa: F841
out["context_input_ids"],
out["context_attention_mask"],
out["retrieved_doc_embeds"],
out["doc_ids"],
)
self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size))
self.assertIsInstance(context_input_ids, torch.Tensor)
self.assertIsInstance(context_attention_mask, torch.Tensor)
self.assertIsInstance(retrieved_doc_embeds, torch.Tensor)
import json
import os
import shutil
import tempfile
from unittest import TestCase
from transformers.configuration_bart import BartConfig
from transformers.configuration_dpr import DPRConfig
from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available
from transformers.testing_utils import require_datasets, require_faiss, require_torch
from transformers.tokenization_bart import BartTokenizer
from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
if is_torch_available() and is_datasets_available() and is_faiss_available():
from transformers.configuration_rag import RagConfig
from transformers.tokenization_rag import RagTokenizer
@require_faiss
@require_datasets
@require_torch
class RagTokenizerTest(TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
self.retrieval_vector_size = 8
# DPR tok
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"want",
"##want",
"##ed",
"wa",
"un",
"runn",
"##ing",
",",
"low",
"lowest",
]
dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer")
os.makedirs(dpr_tokenizer_path, exist_ok=True)
self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
# BART tok
vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"\u0120",
"\u0120l",
"\u0120n",
"\u0120lo",
"\u0120low",
"er",
"\u0120lowest",
"\u0120newer",
"\u0120wider",
"<unk>",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer")
os.makedirs(bart_tokenizer_path, exist_ok=True)
self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer:
return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer"))
def get_bart_tokenizer(self) -> BartTokenizer:
return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer"))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_with_saved_config(self):
save_dir = os.path.join(self.tmpdirname, "rag_tokenizer")
rag_config = RagConfig(question_encoder=DPRConfig().to_dict(), generator=BartConfig().to_dict())
rag_tokenizer = RagTokenizer(question_encoder=self.get_dpr_tokenizer(), generator=self.get_bart_tokenizer())
rag_config.save_pretrained(save_dir)
rag_tokenizer.save_pretrained(save_dir)
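# reload from the saved directory; the RagConfig passed below determines which question encoder / generator sub-tokenizers to instantiate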
new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config)
self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizer)
self.assertEqual(new_rag_tokenizer.question_encoder.vocab, rag_tokenizer.question_encoder.vocab)
self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizer)
self.assertEqual(new_rag_tokenizer.generator.encoder, rag_tokenizer.generator.encoder)