"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "5e09af2acde21f232a6ed2ad2972c8f2269dcecf"
Unverified commit 2cc3cc83, authored by Yih-Dar, committed by GitHub

Add `dataset_revision` argument to `RagConfig` (#29610)



* add arg

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 956f44f1
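
In practice, the new argument lets callers pin the retrieval dataset to a fixed revision: `RagConfig` stores `dataset_revision`, and `CanonicalHFIndex` forwards it to `datasets.load_dataset(..., revision=...)` when loading the passages and the index. A minimal usage sketch, assuming the extra keyword argument passed to `RagRetriever.from_pretrained()` is forwarded to the retriever's `RagConfig` (this is how the updated tests below pass it); the checkpoint name, `index_name`, and the "b24a417" revision string are copied from those tests and are only illustrative:

    from transformers import RagRetriever

    # Sketch based on this diff: dataset_revision ends up on the retriever's
    # RagConfig and is used when the passages dataset and index are loaded.
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-nq",
        index_name="exact",
        use_dummy_dataset=True,      # small dummy index, as in the integration tests
        dataset_revision="b24a417",  # pin the retrieval dataset to a fixed revision
    )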
@@ -111,6 +111,7 @@ class RagConfig(PretrainedConfig):
         output_retrieved=False,
         use_cache=True,
         forced_eos_token_id=None,
+        dataset_revision=None,
         **kwargs,
     ):
         super().__init__(
@@ -156,6 +157,7 @@ class RagConfig(PretrainedConfig):
         self.passages_path = passages_path
         self.index_path = index_path
         self.use_dummy_dataset = use_dummy_dataset
+        self.dataset_revision = dataset_revision
         self.output_retrieved = output_retrieved
......
@@ -266,6 +266,7 @@ class CanonicalHFIndex(HFIndexBase):
         index_name: Optional[str] = None,
         index_path: Optional[str] = None,
         use_dummy_dataset=False,
+        dataset_revision=None,
     ):
         if int(index_path is None) + int(index_name is None) != 1:
             raise ValueError("Please provide `index_name` or `index_path`.")
@@ -274,9 +275,14 @@ class CanonicalHFIndex(HFIndexBase):
         self.index_name = index_name
         self.index_path = index_path
         self.use_dummy_dataset = use_dummy_dataset
+        self.dataset_revision = dataset_revision
         logger.info(f"Loading passages from {self.dataset_name}")
         dataset = load_dataset(
-            self.dataset_name, with_index=False, split=self.dataset_split, dummy=self.use_dummy_dataset
+            self.dataset_name,
+            with_index=False,
+            split=self.dataset_split,
+            dummy=self.use_dummy_dataset,
+            revision=dataset_revision,
         )
         super().__init__(vector_size, dataset, index_initialized=False)
@@ -293,6 +299,7 @@ class CanonicalHFIndex(HFIndexBase):
             split=self.dataset_split,
             index_name=self.index_name,
             dummy=self.use_dummy_dataset,
+            revision=self.dataset_revision,
         )
         self.dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True)
         self._index_initialized = True
@@ -427,6 +434,7 @@ class RagRetriever:
             index_name=config.index_name,
             index_path=config.index_path,
             use_dummy_dataset=config.use_dummy_dataset,
+            dataset_revision=config.dataset_revision,
         )

     @classmethod
......
@@ -730,6 +730,7 @@ class RagModelIntegrationTests(unittest.TestCase):
             use_dummy_dataset=True,
             retrieval_vector_size=768,
             retrieval_batch_size=8,
+            dataset_revision="b24a417",
         )

     @slow
@@ -905,7 +906,7 @@ class RagModelIntegrationTests(unittest.TestCase):
     def test_rag_sequence_generate_batch(self):
         tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
         retriever = RagRetriever.from_pretrained(
-            "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
+            "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
         )
         rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to(
             torch_device
@@ -944,7 +945,10 @@ class RagModelIntegrationTests(unittest.TestCase):
     def test_rag_sequence_generate_batch_from_context_input_ids(self):
         tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
         retriever = RagRetriever.from_pretrained(
-            "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
+            "facebook/rag-sequence-nq",
+            index_name="exact",
+            use_dummy_dataset=True,
+            dataset_revision="b24a417",
         )
         rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to(
             torch_device
@@ -993,7 +997,9 @@ class RagModelIntegrationTests(unittest.TestCase):
     @slow
     def test_rag_token_generate_batch(self):
         tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-        retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+        retriever = RagRetriever.from_pretrained(
+            "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
+        )
         rag_token = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever).to(
             torch_device
         )
@@ -1063,6 +1069,7 @@ class RagModelSaveLoadTests(unittest.TestCase):
             use_dummy_dataset=True,
             retrieval_vector_size=768,
             retrieval_batch_size=8,
+            dataset_revision="b24a417",
         )

     @slow
......
@@ -590,6 +590,7 @@ class TFRagModelIntegrationTests(unittest.TestCase):
             use_dummy_dataset=True,
             retrieval_vector_size=768,
             retrieval_batch_size=8,
+            dataset_revision="b24a417",
         )

     @slow
@@ -794,7 +795,9 @@ class TFRagModelIntegrationTests(unittest.TestCase):
     @slow
     def test_rag_token_greedy_search(self):
         tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-        retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+        retriever = RagRetriever.from_pretrained(
+            "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
+        )
         rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

         # check first two questions
@@ -828,7 +831,9 @@ class TFRagModelIntegrationTests(unittest.TestCase):
     def test_rag_token_generate_batch(self):
         # NOTE: gold labels comes from num_beam=4, so this is effectively beam-search test
         tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-        retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+        retriever = RagRetriever.from_pretrained(
+            "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
+        )
         rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

         input_dict = tokenizer(
@@ -871,7 +876,10 @@ class TFRagModelIntegrationTests(unittest.TestCase):
     def test_rag_sequence_generate_batch(self):
         tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
         retriever = RagRetriever.from_pretrained(
-            "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
+            "facebook/rag-sequence-nq",
+            index_name="exact",
+            use_dummy_dataset=True,
+            dataset_revision="b24a417",
         )
         rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
@@ -908,7 +916,7 @@ class TFRagModelIntegrationTests(unittest.TestCase):
     def test_rag_sequence_generate_batch_from_context_input_ids(self):
         tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
         retriever = RagRetriever.from_pretrained(
-            "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
+            "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417"
         )
         rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
         input_dict = tokenizer(
@@ -976,6 +984,7 @@ class TFRagModelSaveLoadTests(unittest.TestCase):
             use_dummy_dataset=True,
             retrieval_vector_size=768,
             retrieval_batch_size=8,
+            dataset_revision="b24a417",
         )

     @slow
......