Commit 1f8a8c1d authored by jon-tow
Browse files

Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into remove-dataset

parents b4c0275d b0acb337
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -65,8 +64,12 @@ class HeadQA(datasets.GeneratorBasedBuilder): ...@@ -65,8 +64,12 @@ class HeadQA(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("1.1.0") VERSION = datasets.Version("1.1.0")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="es", version=VERSION, description="Spanish HEAD dataset"), datasets.BuilderConfig(
datasets.BuilderConfig(name="en", version=VERSION, description="English HEAD dataset"), name="es", version=VERSION, description="Spanish HEAD dataset"
),
datasets.BuilderConfig(
name="en", version=VERSION, description="English HEAD dataset"
),
] ]
DEFAULT_CONFIG_NAME = "es" DEFAULT_CONFIG_NAME = "es"
...@@ -106,15 +109,24 @@ class HeadQA(datasets.GeneratorBasedBuilder): ...@@ -106,15 +109,24 @@ class HeadQA(datasets.GeneratorBasedBuilder):
return [ return [
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.TRAIN, name=datasets.Split.TRAIN,
gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"train_{dir}.json")}, gen_kwargs={
"data_dir": data_dir,
"filepath": os.path.join(data_lang_dir, f"train_{dir}.json"),
},
), ),
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.TEST, name=datasets.Split.TEST,
gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"test_{dir}.json")}, gen_kwargs={
"data_dir": data_dir,
"filepath": os.path.join(data_lang_dir, f"test_{dir}.json"),
},
), ),
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.VALIDATION, name=datasets.Split.VALIDATION,
gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json")}, gen_kwargs={
"data_dir": data_dir,
"filepath": os.path.join(data_lang_dir, f"dev_{dir}.json"),
},
), ),
] ]
...@@ -134,7 +146,9 @@ class HeadQA(datasets.GeneratorBasedBuilder): ...@@ -134,7 +146,9 @@ class HeadQA(datasets.GeneratorBasedBuilder):
aids = [answer["aid"] for answer in question["answers"]] aids = [answer["aid"] for answer in question["answers"]]
atexts = [answer["atext"].strip() for answer in question["answers"]] atexts = [answer["atext"].strip() for answer in question["answers"]]
answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)] answers = [
{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)
]
id_ = f"{exam_id}_{qid}" id_ = f"{exam_id}_{qid}"
yield id_, { yield id_, {
......
...@@ -71,54 +71,64 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder): ...@@ -71,54 +71,64 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
EthicsConfig( EthicsConfig(
name="commonsense", name="commonsense",
prefix="cm", prefix="cm",
features=datasets.Features({ features=datasets.Features(
{
"label": datasets.Value("int32"), "label": datasets.Value("int32"),
"input": datasets.Value("string"), "input": datasets.Value("string"),
"is_short": datasets.Value("bool"), "is_short": datasets.Value("bool"),
"edited": datasets.Value("bool"), "edited": datasets.Value("bool"),
}), }
description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept." ),
description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.",
), ),
EthicsConfig( EthicsConfig(
name="deontology", name="deontology",
prefix="deontology", prefix="deontology",
features=datasets.Features({ features=datasets.Features(
{
"group_id": datasets.Value("int32"), "group_id": datasets.Value("int32"),
"label": datasets.Value("int32"), "label": datasets.Value("int32"),
"scenario": datasets.Value("string"), "scenario": datasets.Value("string"),
"excuse": datasets.Value("string"), "excuse": datasets.Value("string"),
}), }
),
description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints",
), ),
EthicsConfig( EthicsConfig(
name="justice", name="justice",
prefix="justice", prefix="justice",
features=datasets.Features({ features=datasets.Features(
{
"group_id": datasets.Value("int32"), "group_id": datasets.Value("int32"),
"label": datasets.Value("int32"), "label": datasets.Value("int32"),
"scenario": datasets.Value("string"), "scenario": datasets.Value("string"),
}), }
),
description="The Justice subset contains examples focusing on how a character treats another person", description="The Justice subset contains examples focusing on how a character treats another person",
), ),
EthicsConfig( EthicsConfig(
name="utilitarianism", name="utilitarianism",
prefix="util", prefix="util",
features=datasets.Features({ features=datasets.Features(
{
"activity": datasets.Value("string"), "activity": datasets.Value("string"),
"baseline": datasets.Value("string"), "baseline": datasets.Value("string"),
"rating": datasets.Value("string"), # Empty rating. "rating": datasets.Value("string"), # Empty rating.
}), }
),
description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario",
), ),
EthicsConfig( EthicsConfig(
name="virtue", name="virtue",
prefix="virtue", prefix="virtue",
features=datasets.Features({ features=datasets.Features(
{
"group_id": datasets.Value("int32"), "group_id": datasets.Value("int32"),
"label": datasets.Value("int32"), "label": datasets.Value("int32"),
"scenario": datasets.Value("string"), "scenario": datasets.Value("string"),
"trait": datasets.Value("string"), "trait": datasets.Value("string"),
}), }
),
description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified",
), ),
] ]
...@@ -140,7 +150,12 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder): ...@@ -140,7 +150,12 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
name=datasets.Split.TRAIN, name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"filepath": os.path.join(data_dir, "ethics", self.config.name, f"{self.config.prefix}_train.csv"), "filepath": os.path.join(
data_dir,
"ethics",
self.config.name,
f"{self.config.prefix}_train.csv",
),
"split": "train", "split": "train",
}, },
), ),
...@@ -148,18 +163,22 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder): ...@@ -148,18 +163,22 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
name=datasets.Split.TEST, name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"filepath": os.path.join(data_dir, "ethics", self.config.name, f"{self.config.prefix}_test.csv"), "filepath": os.path.join(
"split": "test" data_dir,
"ethics",
self.config.name,
f"{self.config.prefix}_test.csv",
),
"split": "test",
}, },
) ),
] ]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators` # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split): def _generate_examples(self, filepath, split):
with open(filepath, newline='') as f: with open(filepath, newline="") as f:
if self.config.name == "utilitarianism": if self.config.name == "utilitarianism":
contents = csv.DictReader( contents = csv.DictReader(f, fieldnames=["activity", "baseline"])
f, fieldnames=['activity', "baseline"])
else: else:
contents = csv.DictReader(f) contents = csv.DictReader(f)
# For subsets with grouped scenarios, tag them with an id. # For subsets with grouped scenarios, tag them with an id.
......
...@@ -44,13 +44,13 @@ _LICENSE = "" ...@@ -44,13 +44,13 @@ _LICENSE = ""
_URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" _URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
_NAMES = [ _NAMES = [
'algebra', "algebra",
'counting_and_probability', "counting_and_probability",
'geometry', "geometry",
'intermediate_algebra', "intermediate_algebra",
'number_theory', "number_theory",
'prealgebra', "prealgebra",
'precalculus', "precalculus",
] ]
...@@ -89,7 +89,9 @@ class HendrycksMath(datasets.GeneratorBasedBuilder): ...@@ -89,7 +89,9 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
name=datasets.Split.TRAIN, name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"basepath": os.path.join(data_dir, "MATH", "train", self.config.name), "basepath": os.path.join(
data_dir, "MATH", "train", self.config.name
),
"split": "train", "split": "train",
}, },
), ),
...@@ -97,8 +99,10 @@ class HendrycksMath(datasets.GeneratorBasedBuilder): ...@@ -97,8 +99,10 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
name=datasets.Split.TEST, name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"basepath": os.path.join(data_dir, "MATH", "test", self.config.name), "basepath": os.path.join(
"split": "test" data_dir, "MATH", "test", self.config.name
),
"split": "test",
}, },
), ),
] ]
...@@ -107,7 +111,7 @@ class HendrycksMath(datasets.GeneratorBasedBuilder): ...@@ -107,7 +111,7 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
def _generate_examples(self, basepath, split): def _generate_examples(self, basepath, split):
key = 0 key = 0
for file in sorted(pathlib.Path(basepath).iterdir()): for file in sorted(pathlib.Path(basepath).iterdir()):
with open(file, "r", encoding='utf-8') as f: with open(file, "r", encoding="utf-8") as f:
data = json.load(f) data = json.load(f)
yield key, { yield key, {
"problem": data["problem"], "problem": data["problem"],
......
...@@ -62,12 +62,34 @@ class Lambada(datasets.GeneratorBasedBuilder): ...@@ -62,12 +62,34 @@ class Lambada(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="original", version=VERSION, description="The LAMBADA dataset"), datasets.BuilderConfig(
datasets.BuilderConfig(name="en", version=VERSION, description="The English translated LAMBADA dataset"), name="original", version=VERSION, description="The LAMBADA dataset"
datasets.BuilderConfig(name="fr", version=VERSION, description="The French translated LAMBADA dataset"), ),
datasets.BuilderConfig(name="de", version=VERSION, description="The German translated LAMBADA dataset"), datasets.BuilderConfig(
datasets.BuilderConfig(name="it", version=VERSION, description="The Italian translated LAMBADA dataset"), name="en",
datasets.BuilderConfig(name="es", version=VERSION, description="The Spanish translated LAMBADA dataset"), version=VERSION,
description="The English translated LAMBADA dataset",
),
datasets.BuilderConfig(
name="fr",
version=VERSION,
description="The French translated LAMBADA dataset",
),
datasets.BuilderConfig(
name="de",
version=VERSION,
description="The German translated LAMBADA dataset",
),
datasets.BuilderConfig(
name="it",
version=VERSION,
description="The Italian translated LAMBADA dataset",
),
datasets.BuilderConfig(
name="es",
version=VERSION,
description="The Spanish translated LAMBADA dataset",
),
] ]
DEFAULT_CONFIG_NAME = "original" DEFAULT_CONFIG_NAME = "original"
...@@ -105,6 +127,4 @@ class Lambada(datasets.GeneratorBasedBuilder): ...@@ -105,6 +127,4 @@ class Lambada(datasets.GeneratorBasedBuilder):
with open(filepath, encoding="utf-8") as f: with open(filepath, encoding="utf-8") as f:
for key, row in enumerate(f): for key, row in enumerate(f):
data = json.loads(row) data = json.loads(row)
yield key, { yield key, {"text": data["text"]}
"text": data["text"]
}
...@@ -54,7 +54,9 @@ class Logiqa(datasets.GeneratorBasedBuilder): ...@@ -54,7 +54,9 @@ class Logiqa(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="logiqa", version=VERSION, description="The LogiQA dataset."), datasets.BuilderConfig(
name="logiqa", version=VERSION, description="The LogiQA dataset."
),
] ]
def _info(self): def _info(self):
...@@ -63,9 +65,7 @@ class Logiqa(datasets.GeneratorBasedBuilder): ...@@ -63,9 +65,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
"label": datasets.Value("string"), "label": datasets.Value("string"),
"context": datasets.Value("string"), "context": datasets.Value("string"),
"question": datasets.Value("string"), "question": datasets.Value("string"),
"options": datasets.features.Sequence( "options": datasets.features.Sequence(datasets.Value("string")),
datasets.Value("string")
),
} }
) )
return datasets.DatasetInfo( return datasets.DatasetInfo(
...@@ -77,7 +77,11 @@ class Logiqa(datasets.GeneratorBasedBuilder): ...@@ -77,7 +77,11 @@ class Logiqa(datasets.GeneratorBasedBuilder):
) )
def _split_generators(self, dl_manager): def _split_generators(self, dl_manager):
urls = {"train": _URLS["train"], "test": _URLS["test"], "validation": _URLS["validation"]} urls = {
"train": _URLS["train"],
"test": _URLS["test"],
"validation": _URLS["validation"],
}
data_dir = dl_manager.download_and_extract(urls) data_dir = dl_manager.download_and_extract(urls)
return [ return [
datasets.SplitGenerator( datasets.SplitGenerator(
...@@ -91,10 +95,7 @@ class Logiqa(datasets.GeneratorBasedBuilder): ...@@ -91,10 +95,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.TEST, name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={"filepath": data_dir["test"], "split": "test"},
"filepath": data_dir["test"],
"split": "test"
},
), ),
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.VALIDATION, name=datasets.Split.VALIDATION,
...@@ -110,6 +111,7 @@ class Logiqa(datasets.GeneratorBasedBuilder): ...@@ -110,6 +111,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
def _generate_examples(self, filepath, split): def _generate_examples(self, filepath, split):
def normalize(text): def normalize(text):
return text.replace(".", ". ").strip() return text.replace(".", ". ").strip()
with open(filepath, encoding="utf-8") as f: with open(filepath, encoding="utf-8") as f:
data = f.read().strip().split("\n\n") data = f.read().strip().split("\n\n")
for key, row in enumerate(data): for key, row in enumerate(data):
......
...@@ -50,8 +50,14 @@ class Mutual(datasets.GeneratorBasedBuilder): ...@@ -50,8 +50,14 @@ class Mutual(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="mutual", version=VERSION, description="The MuTual dataset."), datasets.BuilderConfig(
datasets.BuilderConfig(name="mutual_plus", version=VERSION, description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses."), name="mutual", version=VERSION, description="The MuTual dataset."
),
datasets.BuilderConfig(
name="mutual_plus",
version=VERSION,
description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.",
),
] ]
def _info(self): def _info(self):
...@@ -79,7 +85,9 @@ class Mutual(datasets.GeneratorBasedBuilder): ...@@ -79,7 +85,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
name=datasets.Split.TRAIN, name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "train"), "basepath": os.path.join(
data_dir, "MuTual-master", "data", self.config.name, "train"
),
"split": "train", "split": "train",
}, },
), ),
...@@ -87,7 +95,9 @@ class Mutual(datasets.GeneratorBasedBuilder): ...@@ -87,7 +95,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
name=datasets.Split.TEST, name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "test"), "basepath": os.path.join(
data_dir, "MuTual-master", "data", self.config.name, "test"
),
"split": "test", "split": "test",
}, },
), ),
...@@ -95,7 +105,9 @@ class Mutual(datasets.GeneratorBasedBuilder): ...@@ -95,7 +105,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
name=datasets.Split.VALIDATION, name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "dev"), "basepath": os.path.join(
data_dir, "MuTual-master", "data", self.config.name, "dev"
),
"split": "dev", "split": "dev",
}, },
), ),
...@@ -109,7 +121,7 @@ class Mutual(datasets.GeneratorBasedBuilder): ...@@ -109,7 +121,7 @@ class Mutual(datasets.GeneratorBasedBuilder):
for file in sorted(Path(basepath).iterdir()): for file in sorted(Path(basepath).iterdir()):
if file.suffix != ".txt": if file.suffix != ".txt":
continue continue
with open(file, "r", encoding='utf-8') as f: with open(file, "r", encoding="utf-8") as f:
data_str = f.read() data_str = f.read()
# Ignore the occasional empty file. # Ignore the occasional empty file.
if not data_str: if not data_str:
......
...@@ -103,10 +103,7 @@ class Pile(datasets.GeneratorBasedBuilder): ...@@ -103,10 +103,7 @@ class Pile(datasets.GeneratorBasedBuilder):
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.TEST, name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={"filepath": data_dir["test"], "split": "test"},
"filepath": data_dir["test"],
"split": "test"
},
), ),
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.VALIDATION, name=datasets.Split.VALIDATION,
......
...@@ -54,7 +54,9 @@ class Quac(datasets.GeneratorBasedBuilder): ...@@ -54,7 +54,9 @@ class Quac(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("1.1.0") VERSION = datasets.Version("1.1.0")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="quac", version=VERSION, description="The QuAC dataset"), datasets.BuilderConfig(
name="quac", version=VERSION, description="The QuAC dataset"
),
] ]
def _info(self): def _info(self):
...@@ -90,10 +92,7 @@ class Quac(datasets.GeneratorBasedBuilder): ...@@ -90,10 +92,7 @@ class Quac(datasets.GeneratorBasedBuilder):
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.VALIDATION, name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
"filepath": data_dir["validation"],
"split": "validation"
},
), ),
] ]
...@@ -105,7 +104,7 @@ class Quac(datasets.GeneratorBasedBuilder): ...@@ -105,7 +104,7 @@ class Quac(datasets.GeneratorBasedBuilder):
for row in data: for row in data:
paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "") paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
qas = row["paragraphs"][0]["qas"] qas = row["paragraphs"][0]["qas"]
qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas] qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
for (question, answer) in qa_pairs: for (question, answer) in qa_pairs:
# Yields examples as (key, example) tuples # Yields examples as (key, example) tuples
yield key, { yield key, {
......
...@@ -44,13 +44,16 @@ _LICENSE = "" ...@@ -44,13 +44,16 @@ _LICENSE = ""
class SatAnalogies(datasets.GeneratorBasedBuilder): class SatAnalogies(datasets.GeneratorBasedBuilder):
""" SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions. """ """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions."""
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name="sat_analogies", version=VERSION, datasets.BuilderConfig(
description="The SAT Analogy Questions dataset"), name="sat_analogies",
version=VERSION,
description="The SAT Analogy Questions dataset",
),
] ]
@property @property
...@@ -58,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -58,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
return ( return (
"To use SAT Analogy Questions you have to download it manually. Please " "To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). " "email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you recieve a download link for the dataset, supply the local path " "Once you receive a download link for the dataset, supply the local path "
"as the `data_dir` arg: " "as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`" "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
) )
...@@ -68,9 +71,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -68,9 +71,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
{ {
"source": datasets.Value("string"), "source": datasets.Value("string"),
"stem": datasets.Value("string"), "stem": datasets.Value("string"),
"choices": datasets.features.Sequence( "choices": datasets.features.Sequence(datasets.Value("string")),
datasets.Value("string")
),
"solution": datasets.Value("string"), "solution": datasets.Value("string"),
} }
) )
...@@ -108,7 +109,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -108,7 +109,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
if len(line) == 0 and record: if len(line) == 0 and record:
data.append(record) data.append(record)
record = [] record = []
elif len(line) > 0 and line[0] == '#': elif len(line) > 0 and line[0] == "#":
# Skip comments. # Skip comments.
continue continue
else: else:
...@@ -120,8 +121,8 @@ class SatAnalogies(datasets.GeneratorBasedBuilder): ...@@ -120,8 +121,8 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
choices = record[-6:-1] choices = record[-6:-1]
solution = record[-1] solution = record[-1]
yield key, { yield key, {
'source': source, "source": source,
'stem': stem, "stem": stem,
'choices': choices, "choices": choices,
'solution': solution, "solution": solution,
} }
...@@ -50,13 +50,14 @@ _URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz" ...@@ -50,13 +50,14 @@ _URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"
class Triviaqa(datasets.GeneratorBasedBuilder): class Triviaqa(datasets.GeneratorBasedBuilder):
""" TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples """ """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples"""
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig( datasets.BuilderConfig(
name="triviaqa", version=VERSION, description="The TriviaQA dataset"), name="triviaqa", version=VERSION, description="The TriviaQA dataset"
),
] ]
def _info(self): def _info(self):
...@@ -69,7 +70,7 @@ class Triviaqa(datasets.GeneratorBasedBuilder): ...@@ -69,7 +70,7 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
"aliases": datasets.features.Sequence( "aliases": datasets.features.Sequence(
datasets.Value("string"), datasets.Value("string"),
), ),
"value": datasets.Value("string") "value": datasets.Value("string"),
}, },
"search_results": datasets.features.Sequence( "search_results": datasets.features.Sequence(
{ {
...@@ -120,12 +121,24 @@ class Triviaqa(datasets.GeneratorBasedBuilder): ...@@ -120,12 +121,24 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
for search_result in data["SearchResults"]: for search_result in data["SearchResults"]:
search_results.append( search_results.append(
{ {
"description": search_result["Description"] if "Description" in search_result else "", "description": search_result["Description"]
"filename": search_result["Filename"] if "Filename" in search_result else "", if "Description" in search_result
"rank": search_result["Rank"] if "Rank" in search_result else -1, else "",
"title": search_result["Title"] if "Title" in search_result else "", "filename": search_result["Filename"]
"url": search_result["Url"] if "Url" in search_result else "", if "Filename" in search_result
"search_context": search_result["SearchContext"] if "SearchContext" in search_result else "", else "",
"rank": search_result["Rank"]
if "Rank" in search_result
else -1,
"title": search_result["Title"]
if "Title" in search_result
else "",
"url": search_result["Url"]
if "Url" in search_result
else "",
"search_context": search_result["SearchContext"]
if "SearchContext" in search_result
else "",
} }
) )
yield key, { yield key, {
......
...@@ -64,8 +64,9 @@ class Unscramble(datasets.GeneratorBasedBuilder): ...@@ -64,8 +64,9 @@ class Unscramble(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig(name=name, version=version, datasets.BuilderConfig(
description=_DESCRIPTIONS[name]) name=name, version=version, description=_DESCRIPTIONS[name]
)
for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
] ]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment