Add custom `image`less `HeadQA` dataset

897ae76d · Jon Tow · 13317f8c · 897ae76d · 897ae76d · 897ae76d
Commit 897ae76d authored Mar 18, 2022 by Jon Tow
3 changed files
--- a/lm_eval/datasets/headqa/dataset_infos.json
+++ b/lm_eval/datasets/headqa/dataset_infos.json
+{"es": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n    title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n    author = \"Vilares, David  and\n      G{'o}mez-Rodr{'i}guez, Carlos\",\n    booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n    month = jul,\n    year = \"2019\",\n    address = \"Florence, Italy\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://www.aclweb.org/anthology/P19-1092\",\n    doi = \"10.18653/v1/P19-1092\",\n    pages = \"960--966\",\n    abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "es", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1196021, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1169819, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 556924, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2922764, "size_in_bytes": 82288266}, "en": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n    title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n    author = \"Vilares, David  and\n      G{'o}mez-Rodr{'i}guez, Carlos\",\n    booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n    month = jul,\n    year = \"2019\",\n    address = \"Florence, Italy\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://www.aclweb.org/anthology/P19-1092\",\n    doi = \"10.18653/v1/P19-1092\",\n    pages = \"960--966\",\n    abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "en", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1123151, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1097349, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 523462, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2743962, "size_in_bytes": 82109464}}
\ No newline at end of file
--- a/lm_eval/datasets/headqa/headqa.py
+++ b/lm_eval/datasets/headqa/headqa.py
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# NOTE: This is an exact copy of
+# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py 
+# with the exception of the `image` feature. This is to avoid adding `Pillow`
+# as a dependency.
+"""HEAD-QA: A Healthcare Dataset for Complex Reasoning"""
+import json
+import os
+import datasets
+_CITATION = """\
+@inproceedings{vilares-gomez-rodriguez-2019-head,
+    title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning",
+    author = "Vilares, David  and
+      G{\'o}mez-Rodr{\'i}guez, Carlos",
+    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
+    month = jul,
+    year = "2019",
+    address = "Florence, Italy",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/P19-1092",
+    doi = "10.18653/v1/P19-1092",
+    pages = "960--966",
+    abstract = "We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.",
+}
+"""
+_DESCRIPTION = """\
+HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the
+Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio
+de Sanidad, Consumo y Bienestar Social.
+The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.
+"""
+_HOMEPAGE = "https://aghie.github.io/head-qa/"
+_LICENSE = "MIT License"
+_URL = "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t"
+_DIRS = {"es": "HEAD", "en": "HEAD_EN"}
+class HeadQA(datasets.GeneratorBasedBuilder):
+    """HEAD-QA: A Healthcare Dataset for Complex Reasoning"""
+    VERSION = datasets.Version("1.1.0")
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(name="es", version=VERSION, description="Spanish HEAD dataset"),
+        datasets.BuilderConfig(name="en", version=VERSION, description="English HEAD dataset"),
+    ]
+    DEFAULT_CONFIG_NAME = "es"
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "name": datasets.Value("string"),
+                    "year": datasets.Value("string"),
+                    "category": datasets.Value("string"),
+                    "qid": datasets.Value("int32"),
+                    "qtext": datasets.Value("string"),
+                    "ra": datasets.Value("int32"),
+                    "answers": [
+                        {
+                            "aid": datasets.Value("int32"),
+                            "atext": datasets.Value("string"),
+                        }
+                    ],
+                }
+            ),
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        data_dir = dl_manager.download_and_extract(_URL)
+        dir = _DIRS[self.config.name]
+        data_lang_dir = os.path.join(data_dir, dir)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"train_{dir}.json")},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"test_{dir}.json")},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json")},
+            ),
+        ]
+    def _generate_examples(self, data_dir, filepath):
+        """Yields examples."""
+        with open(filepath, encoding="utf-8") as f:
+            head_qa = json.load(f)
+            for exam_id, exam in enumerate(head_qa["exams"]):
+                content = head_qa["exams"][exam]
+                name = content["name"].strip()
+                year = content["year"].strip()
+                category = content["category"].strip()
+                for question in content["data"]:
+                    qid = int(question["qid"].strip())
+                    qtext = question["qtext"].strip()
+                    ra = int(question["ra"].strip())
+                    aids = [answer["aid"] for answer in question["answers"]]
+                    atexts = [answer["atext"].strip() for answer in question["answers"]]
+                    answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)]
+                    id_ = f"{exam_id}_{qid}"
+                    yield id_, {
+                        "name": name,
+                        "year": year,
+                        "category": category,
+                        "qid": qid,
+                        "qtext": qtext,
+                        "ra": ra,
+                        "answers": answers,
+                    }
--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
@@ -8,6 +8,8 @@ even for highly specialized humans.
 Homepage: https://aghie.github.io/head-qa/
 """
+import inspect
+import lm_eval.datasets.headqa.headqa
 from lm_eval.base import MultipleChoiceTask
@@ -25,7 +27,7 @@ _CITATION = """
 class HeadQABase(MultipleChoiceTask):
    VERSION = 0
-    DATASET_PATH = "head_qa"
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.headqa.headqa)
    def has_training_docs(self):
        return True