Remove custom datasets that are in HF (the Hugging Face Hub)

b4c0275d · jon-tow · 7585ec56 · 7585ec56 · 7585ec56 · 7585ec56
Commit b4c0275d authored Jun 11, 2022 by jon-tow
8 changed files
--- a/lm_eval/datasets/gsm8k/__init__.py
+++ b/lm_eval/datasets/gsm8k/__init__.py
--- a/lm_eval/datasets/gsm8k/dataset_infos.json
+++ b/lm_eval/datasets/gsm8k/dataset_infos.json
-{"gsm8k": {"description": "State-of-the-art language models can match human performance on many tasks, but \nthey still struggle to robustly perform multi-step mathematical reasoning. To \ndiagnose the failures of current models and support research, we introduce GSM8K,\na dataset of 8.5K high quality linguistically diverse grade school math word problems.\nWe find that even the largest transformer models fail to achieve high test performance, \ndespite the conceptual simplicity of this problem distribution.\n", "citation": "@misc{cobbe2021training,\n      title={Training Verifiers to Solve Math Word Problems},\n      author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},\n      year={2021},\n      eprint={2110.14168},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG}\n}\n", "homepage": "https://github.com/openai/grade-school-math", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "gsm8_k", "config_name": "gsm8k", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 3963202, "num_examples": 7473, "dataset_name": "gsm8_k"}, "test": {"name": "test", "num_bytes": 713732, "num_examples": 1319, "dataset_name": "gsm8_k"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl": {"num_bytes": 4166206, "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"}, "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl": {"num_bytes": 749738, "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"}}, "download_size": 4915944, 
"post_processing_size": null, "dataset_size": 4676934, "size_in_bytes": 9592878}}
\ No newline at end of file
--- a/lm_eval/datasets/gsm8k/gsm8k.py
+++ b/lm_eval/datasets/gsm8k/gsm8k.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Grade School Math 8k dataset."""
-
-
-import json
-
-import datasets
-
-
-_CITATION = """\
-@misc{cobbe2021training,
-      title={Training Verifiers to Solve Math Word Problems},
-      author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
-      year={2021},
-      eprint={2110.14168},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG}
-}
-"""
-
-_DESCRIPTION = """\
-State-of-the-art language models can match human performance on many tasks, but 
-they still struggle to robustly perform multi-step mathematical reasoning. To 
-diagnose the failures of current models and support research, we introduce GSM8K,
-a dataset of 8.5K high quality linguistically diverse grade school math word problems.
-We find that even the largest transformer models fail to achieve high test performance, 
-despite the conceptual simplicity of this problem distribution.
-"""
-
-_HOMEPAGE = "https://github.com/openai/grade-school-math"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-_URLS = {
-    "train": "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl",
-    "test": "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl",
-}
-
-
-class GSM8K(datasets.GeneratorBasedBuilder):
-    """Grade School Math 8k"""
-
-    VERSION = datasets.Version("0.0.1")
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="gsm8k", version=VERSION,
-                               description="The Grade School Math 8k dataset."),
-    ]
-
-    def _info(self):
-        features = datasets.Features(
-            {
-                "question": datasets.Value("string"),
-                "answer": datasets.Value("string"),
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = {"train": _URLS["train"], "test": _URLS["test"]}
-        data_dir = dl_manager.download_and_extract(urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["train"],
-                    "split": "train",
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["test"],
-                    "split": "test"
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        with open(filepath, encoding="utf-8") as f:
-            for key, row in enumerate(f):
-                data = json.loads(row)
-                yield key, {
-                    "question": data["question"],
-                    "answer": data["answer"],
-                }
--- a/lm_eval/datasets/truthfulqa/__init__.py
+++ b/lm_eval/datasets/truthfulqa/__init__.py
--- a/lm_eval/datasets/truthfulqa/dataset_infos.json
+++ b/lm_eval/datasets/truthfulqa/dataset_infos.json
-{"multiple_choice": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n    author={Stephanie Lin and Jacob Hilton and Owain Evans},\n    year={2021},\n    eprint={2109.07958},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "mc1_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "mc2_targets": {"choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "multiple_choice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 610333, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": {"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json": {"num_bytes": 710607, "checksum": 
"6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}}, "download_size": 710607, "post_processing_size": null, "dataset_size": 610333, "size_in_bytes": 1320940}, "generation": {"description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task", "citation": "@misc{lin2021truthfulqa,\n    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n    author={Stephanie Lin and Jacob Hilton and Owain Evans},\n    year={2021},\n    eprint={2109.07958},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/sylinrl/TruthfulQA", "license": "", "features": {"category": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "best_answer": {"dtype": "string", "id": null, "_type": "Value"}, "correct_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "incorrect_answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "truthfulqa", "config_name": "generation", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 463860, "num_examples": 817, "dataset_name": "truthfulqa"}}, "download_checksums": 
{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv": {"num_bytes": 443723, "checksum": "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}}, "download_size": 443723, "post_processing_size": null, "dataset_size": 463860, "size_in_bytes": 907583}}
\ No newline at end of file
--- a/lm_eval/datasets/truthfulqa/truthfulqa.py
+++ b/lm_eval/datasets/truthfulqa/truthfulqa.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""TruthfulQA dataset."""
-
-
-import csv
-import json
-
-import datasets
-
-
-_CITATION = """\
-@misc{lin2021truthfulqa,
-    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
-    author={Stephanie Lin and Jacob Hilton and Owain Evans},
-    year={2021},
-    eprint={2109.07958},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-TruthfulQA is a benchmark to measure whether a language model is truthful in
-generating answers to questions. The benchmark comprises 817 questions that
-span 38 categories, including health, law, finance and politics. Questions are
-crafted so that some humans would answer falsely due to a false belief or
-misconception. To perform well, models must avoid generating false answers
-learned from imitating human texts.
-"""
-
-_HOMEPAGE = "https://github.com/sylinrl/TruthfulQA"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-
-class TruthfulqaConfig(datasets.BuilderConfig):
-    """BuilderConfig for TruthfulQA."""
-
-    def __init__(self, url, features, **kwargs):
-        """BuilderConfig for TruthfulQA.
-
-        Args:
-        url: *string*, the url to the specific subset of the GPT3 Arithmetic dataset.
-        features: *list[string]*, list of the features that will appear in the
-            feature dict.
-        """
-        # Version history:
-        super().__init__(version=datasets.Version("0.0.1"), **kwargs)
-        self.url = url
-        self.features = features
-
-
-class Truthfulqa(datasets.GeneratorBasedBuilder):
-    """TruthfulQA is a benchmark to measure whether a language model is truthful in
-generating answers to questions."""
-
-    BUILDER_CONFIGS = [
-        TruthfulqaConfig(
-            name="multiple_choice",
-            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
-            features=datasets.Features({
-                "question": datasets.Value("string"),
-                "mc1_targets": {
-                    "choices": datasets.features.Sequence(datasets.Value("string")),
-                    "labels": datasets.features.Sequence(datasets.Value("int32")),
-                },
-                "mc2_targets": {
-                    "choices": datasets.features.Sequence(datasets.Value("string")),
-                    "labels": datasets.features.Sequence(datasets.Value("int32")),
-                }
-            }),
-            description="The multiple choice TruthfulQA task"
-        ),
-        TruthfulqaConfig(
-            name="generation",
-            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
-            features=datasets.Features({
-                "category": datasets.Value("string"),
-                "question": datasets.Value("string"),
-                "best_answer": datasets.Value("string"),
-                "correct_answers": datasets.features.Sequence(datasets.Value("string")),
-                "incorrect_answers": datasets.features.Sequence(datasets.Value("string")),
-                "source": datasets.Value("string"),
-            }),
-            description="The generative TruthfulQA task"
-        )
-    ]
-
-    def _info(self):
-        return datasets.DatasetInfo(
-            description=f"{_DESCRIPTION}\n{self.config.description}",
-            features=self.config.features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = self.config.url
-        data_dir = dl_manager.download_and_extract(urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir,
-                    "split": "validation",
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        if self.config.name == "multiple_choice":
-            # Multiple choice data is in a `JSON` file.
-            with open(filepath, encoding="utf-8") as f:
-                contents = json.load(f)
-                for key, row in enumerate(contents):
-                    yield key, {
-                        "question": row["question"],
-                        "mc1_targets": {
-                            "choices": row["mc1_targets"].keys(),
-                            "labels": row["mc1_targets"].values(),
-                        },
-                        "mc2_targets": {
-                            "choices": row["mc2_targets"].keys(),
-                            "labels": row["mc2_targets"].values(),
-                        }
-                    }
-        else:
-            # Generation data is in a `CSV` file.
-            with open(filepath, newline='') as f:
-                contents = csv.DictReader(f)
-                for key, row in enumerate(contents):
-                    # Ensure that references exist.
-                    if not row['Correct Answers'] or not row['Incorrect Answers']:
-                        continue
-                    yield key, {
-                        "category": row["Category"],
-                        "question": row["Question"],
-                        "best_answer": row["Best Answer"],
-                        # split on ";"
-                        "correct_answers": row["Correct Answers"].strip().split(";"),
-                        "incorrect_answers": row["Incorrect Answers"].strip().split(";"),
-                        "source": row["Source"],
-                    }
--- a/lm_eval/tasks/gsm8k.py
+++ b/lm_eval/tasks/gsm8k.py
@@ -16,10 +16,7 @@ model's sample/generation function.

 Homepage: https://github.com/openai/grade-school-math
 """
-import inspect
 import re
-import lm_eval.datasets.gsm8k.gsm8k
-from pathlib import Path
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean

@@ -42,8 +39,8 @@ INVALID_ANS = "[invalid]"

 class GradeSchoolMath8K(Task):
    VERSION = 0
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.gsm8k.gsm8k)
-    DATASET_NAME = None
+    DATASET_PATH = "gsm8k"
+    DATASET_NAME = "main"

    def has_training_docs(self):
        return True

--- a/lm_eval/tasks/truthfulqa.py
+++ b/lm_eval/tasks/truthfulqa.py
@@ -19,11 +19,9 @@ we could try this?

 Homepage: https://github.com/sylinrl/TruthfulQA
 """
-import inspect
 import numpy as np
 import sacrebleu
 import datasets
-import lm_eval.datasets.truthfulqa.truthfulqa
 from rouge_score import rouge_scorer, scoring
 from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
@@ -60,7 +58,7 @@ QA_PROMPT = (

 class TruthfulQAMultipleChoice(Task):
    VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
+    DATASET_PATH = "truthful_qa"
    DATASET_NAME = "multiple_choice"

    def has_training_docs(self):
@@ -158,7 +156,7 @@ class TruthfulQAMultipleChoice(Task):

 class TruthfulQAGeneration(Task):
    VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
+    DATASET_PATH = "truthful_qa"
    DATASET_NAME = "generation"

    def __init__(self):