Commit b4c0275d authored by jon-tow

Remove custom datasets that are in HF

parent 7585ec56
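In effect, this commit deletes the bundled Hugging Face dataset scripts for GSM8K and TruthfulQA (shown below) and repoints the corresponding task classes at the copies hosted on the HF Hub. A minimal sketch of the equivalent load calls after the change, assuming the `datasets` library is installed (the Hub IDs and config names are taken from the task diffs below):

import datasets

# Before: tasks loaded the bundled scripts by file path, e.g.
# datasets.load_dataset(inspect.getfile(lm_eval.datasets.gsm8k.gsm8k)).
# After: the same data comes straight from the Hugging Face Hub.
gsm8k = datasets.load_dataset("gsm8k", "main")                    # train/test splits
tqa_mc = datasets.load_dataset("truthful_qa", "multiple_choice")  # validation split
tqa_gen = datasets.load_dataset("truthful_qa", "generation")      # validation split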
{"gsm8k": {"description": "State-of-the-art language models can match human performance on many tasks, but \nthey still struggle to robustly perform multi-step mathematical reasoning. To \ndiagnose the failures of current models and support research, we introduce GSM8K,\na dataset of 8.5K high quality linguistically diverse grade school math word problems.\nWe find that even the largest transformer models fail to achieve high test performance, \ndespite the conceptual simplicity of this problem distribution.\n", "citation": "@misc{cobbe2021training,\n title={Training Verifiers to Solve Math Word Problems},\n author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n year={2021},\n eprint={2110.14168},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n", "homepage": "https://github.com/openai/grade-school-math", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8_k", "config_name": "gsm8k", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 3963202, "num_examples": 7473, "dataset_name": "gsm8_k"}, "test": {"name": "test", "num_bytes": 713732, "num_examples": 1319, "dataset_name": "gsm8_k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl": {"num_bytes": 4166206, "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl": {"num_bytes": 749738, "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"}}, "download_size": 4915944, "post_processing_size": null, "dataset_size": 4676934, "size_in_bytes": 9592878}}
\ No newline at end of file
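The removed metadata file above (the builder's dataset_infos.json) pins split sizes and SHA-256 checksums for the raw GSM8K files. A quick sketch of reproducing one recorded checksum, with the URL and expected digest copied from the entry above (assumes network access):

import hashlib
import urllib.request

url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
expected = "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"

# Download test.jsonl and compare its SHA-256 digest with the recorded one.
data = urllib.request.urlopen(url).read()
assert hashlib.sha256(data).hexdigest() == expected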
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Grade School Math 8k dataset."""
import json
import datasets
_CITATION = """\
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
year={2021},
eprint={2110.14168},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""
_DESCRIPTION = """\
State-of-the-art language models can match human performance on many tasks, but
they still struggle to robustly perform multi-step mathematical reasoning. To
diagnose the failures of current models and support research, we introduce GSM8K,
a dataset of 8.5K high quality linguistically diverse grade school math word problems.
We find that even the largest transformer models fail to achieve high test performance,
despite the conceptual simplicity of this problem distribution.
"""
_HOMEPAGE = "https://github.com/openai/grade-school-math"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"train": "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl",
"test": "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl",
}
class GSM8K(datasets.GeneratorBasedBuilder):
"""Grade School Math 8k"""
VERSION = datasets.Version("0.0.1")
BUILDER_CONFIGS = [
datasets.BuilderConfig(name="gsm8k", version=VERSION,
description="The Grade School Math 8k dataset."),
]
def _info(self):
features = datasets.Features(
{
"question": datasets.Value("string"),
"answer": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = {"train": _URLS["train"], "test": _URLS["test"]}
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["train"],
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["test"],
"split": "test"
},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
for key, row in enumerate(f):
data = json.loads(row)
yield key, {
"question": data["question"],
"answer": data["answer"],
}
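For reference, this script was consumed by pointing `datasets.load_dataset` at the file itself, which is exactly what the task diff below removes. A usage sketch (the local path is inferred from the `lm_eval.datasets.gsm8k.gsm8k` import in that diff; the "#### <number>" suffix is GSM8K's standard final-answer marker):

import datasets

# Load the (now removed) local script; lm_eval previously resolved this
# path with inspect.getfile(lm_eval.datasets.gsm8k.gsm8k).
ds = datasets.load_dataset("lm_eval/datasets/gsm8k/gsm8k.py", "gsm8k")
example = ds["train"][0]
print(example["question"])
# Answers hold step-by-step reasoning ending in "#### <final number>".
print(example["answer"])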
{"multiple_choice": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "mc1_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "mc2_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "multiple_choice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 610333, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json": {"num_bytes": 710607, "checksum": "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}}, "download_size": 710607, "post_processing_size": null, "dataset_size": 610333, "size_in_bytes": 1320940}, "generation": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"category": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "best_answer": {"dtype": "string", "id": null, "_type": "Value"}, "correct_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "incorrect_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "generation", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 463860, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv": {"num_bytes": 443723, "checksum": "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}}, "download_size": 443723, "post_processing_size": null, "dataset_size": 463860, "size_in_bytes": 907583}}
\ No newline at end of file
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TruthfulQA dataset."""
import csv
import json
import datasets
_CITATION = """\
@misc{lin2021truthfulqa,
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
author={Stephanie Lin and Jacob Hilton and Owain Evans},
year={2021},
eprint={2109.07958},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
TruthfulQA is a benchmark to measure whether a language model is truthful in
generating answers to questions. The benchmark comprises 817 questions that
span 38 categories, including health, law, finance and politics. Questions are
crafted so that some humans would answer falsely due to a false belief or
misconception. To perform well, models must avoid generating false answers
learned from imitating human texts.
"""
_HOMEPAGE = "https://github.com/sylinrl/TruthfulQA"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
class TruthfulqaConfig(datasets.BuilderConfig):
    """BuilderConfig for TruthfulQA."""

    def __init__(self, url, features, **kwargs):
        """BuilderConfig for TruthfulQA.

        Args:
            url: *string*, the URL to the data file for this TruthfulQA task.
            features: *datasets.Features*, the features that will appear in the
                feature dict.
        """
        super().__init__(version=datasets.Version("0.0.1"), **kwargs)
        self.url = url
        self.features = features
class Truthfulqa(datasets.GeneratorBasedBuilder):
    """TruthfulQA is a benchmark to measure whether a language model is truthful
    in generating answers to questions."""

    BUILDER_CONFIGS = [
        TruthfulqaConfig(
            name="multiple_choice",
            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
            features=datasets.Features(
                {
                    "question": datasets.Value("string"),
                    "mc1_targets": {
                        "choices": datasets.features.Sequence(datasets.Value("string")),
                        "labels": datasets.features.Sequence(datasets.Value("int32")),
                    },
                    "mc2_targets": {
                        "choices": datasets.features.Sequence(datasets.Value("string")),
                        "labels": datasets.features.Sequence(datasets.Value("int32")),
                    },
                }
            ),
            description="The multiple choice TruthfulQA task",
        ),
        TruthfulqaConfig(
            name="generation",
            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
            features=datasets.Features(
                {
                    "category": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "best_answer": datasets.Value("string"),
                    "correct_answers": datasets.features.Sequence(datasets.Value("string")),
                    "incorrect_answers": datasets.features.Sequence(datasets.Value("string")),
                    "source": datasets.Value("string"),
                }
            ),
            description="The generative TruthfulQA task",
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=self.config.features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        urls = self.config.url
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples.
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "validation",
                },
            ),
        ]
    # Method parameters are unpacked from `gen_kwargs` as given in `_split_generators`.
    def _generate_examples(self, filepath, split):
        if self.config.name == "multiple_choice":
            # Multiple choice data is in a JSON file.
            with open(filepath, encoding="utf-8") as f:
                contents = json.load(f)
                for key, row in enumerate(contents):
                    yield key, {
                        "question": row["question"],
                        "mc1_targets": {
                            "choices": list(row["mc1_targets"].keys()),
                            "labels": list(row["mc1_targets"].values()),
                        },
                        "mc2_targets": {
                            "choices": list(row["mc2_targets"].keys()),
                            "labels": list(row["mc2_targets"].values()),
                        },
                    }
        else:
            # Generation data is in a CSV file.
            with open(filepath, newline="", encoding="utf-8") as f:
                contents = csv.DictReader(f)
                for key, row in enumerate(contents):
                    # Skip rows that are missing reference answers.
                    if not row["Correct Answers"] or not row["Incorrect Answers"]:
                        continue
                    yield key, {
                        "category": row["Category"],
                        "question": row["Question"],
                        "best_answer": row["Best Answer"],
                        # Multiple reference answers are delimited by ";".
                        "correct_answers": row["Correct Answers"].strip().split(";"),
                        "incorrect_answers": row["Incorrect Answers"].strip().split(";"),
                        "source": row["Source"],
                    }
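The multiple_choice branch above reads each row's `mc1_targets`/`mc2_targets` as a dict mapping answer strings to 0/1 labels. A sketch of the row shape the parser implies (values are illustrative, modeled on a well-known TruthfulQA item rather than copied from mc_task.json):

# Shape implied by the parser above: answer text -> truthfulness label.
row = {
    "question": "What happens if you crack your knuckles a lot?",
    "mc1_targets": {
        "Nothing in particular happens if you crack your knuckles a lot.": 1,
        "You will develop arthritis.": 0,
    },
}
choices = list(row["mc1_targets"].keys())   # answer strings
labels = list(row["mc1_targets"].values())  # 1 = truthful, 0 = false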
@@ -16,10 +16,7 @@ model's sample/generation function.
 Homepage: https://github.com/openai/grade-school-math
 """
-import inspect
 import re
-import lm_eval.datasets.gsm8k.gsm8k
-from pathlib import Path
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
@@ -42,8 +39,8 @@ INVALID_ANS = "[invalid]"
 class GradeSchoolMath8K(Task):
     VERSION = 0
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.gsm8k.gsm8k)
-    DATASET_NAME = None
+    DATASET_PATH = "gsm8k"
+    DATASET_NAME = "main"

     def has_training_docs(self):
         return True
...
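The hunk context above (`import re`, `INVALID_ANS = "[invalid]"`) points at the task's regex-based answer extraction, which this commit leaves untouched. A sketch of that logic under GSM8K's "#### <number>" convention (the exact pattern is an assumption; it is not shown in this diff):

import re

ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"

def extract_answer(completion: str) -> str:
    # Pull the final number after "####"; fall back to the invalid marker.
    match = ANS_RE.search(completion)
    return match.group(1).strip().replace(",", "") if match else INVALID_ANS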
@@ -19,11 +19,9 @@ we could try this?
 Homepage: https://github.com/sylinrl/TruthfulQA
 """
-import inspect
 import numpy as np
 import sacrebleu
 import datasets
-import lm_eval.datasets.truthfulqa.truthfulqa
 from rouge_score import rouge_scorer, scoring
 from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
@@ -60,7 +58,7 @@ QA_PROMPT = (
 class TruthfulQAMultipleChoice(Task):
     VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
+    DATASET_PATH = "truthful_qa"
     DATASET_NAME = "multiple_choice"

     def has_training_docs(self):
@@ -158,7 +156,7 @@ class TruthfulQAMultipleChoice(Task):
 class TruthfulQAGeneration(Task):
     VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
+    DATASET_PATH = "truthful_qa"
     DATASET_NAME = "generation"

     def __init__(self):
...
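Since the Hub's truthful_qa exposes the same `multiple_choice` and `generation` configs the local script defined, one quick way to validate the migration is a parity check against the row counts recorded in the removed dataset_infos.json (assumes network access):

from datasets import load_dataset

mc = load_dataset("truthful_qa", "multiple_choice")["validation"]
gen = load_dataset("truthful_qa", "generation")["validation"]
# 817 rows per config, per the dataset_infos.json removed above.
assert len(mc) == len(gen) == 817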