# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NOTE: This is an exact copy of
# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py
# with the exception of the `image` feature. This is to avoid adding `Pillow`
# as a dependency.
"""HEAD-QA: A Healthcare Dataset for Complex Reasoning."""


import json
import os

import datasets


_CITATION = """\
@inproceedings{vilares-gomez-rodriguez-2019-head,
    title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning",
    author = "Vilares, David  and
      G{\'o}mez-Rodr{\'i}guez, Carlos",
    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2019",
    address = "Florence, Italy",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/P19-1092",
    doi = "10.18653/v1/P19-1092",
    pages = "960--966",
    abstract = "We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.",
}
"""

_DESCRIPTION = """\
HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the
Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio
de Sanidad, Consumo y Bienestar Social.
The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.
"""

_HOMEPAGE = "https://aghie.github.io/head-qa/"

# The Spanish data comes from the "Ministerio de Sanidad, Consumo y Bienestar Social", as indicated here: https://github.com/aghie/head-qa
# This Spanish data seems to follow the intellectual property rights stated here: https://www.sanidad.gob.es/avisoLegal/home.htm
# The English data was translated by the authors of head-qa (https://arxiv.org/pdf/1906.04701.pdf).
_LICENSE = "Custom license"

_URL = "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t"

_DIRS = {"es": "HEAD", "en": "HEAD_EN"}


class HeadQA(datasets.GeneratorBasedBuilder):
    """HEAD-QA: A Healthcare Dataset for Complex Reasoning"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="es", version=VERSION, description="Spanish HEAD dataset"
        ),
        datasets.BuilderConfig(
            name="en", version=VERSION, description="English HEAD dataset"
        ),
    ]

    DEFAULT_CONFIG_NAME = "es"

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "name": datasets.Value("string"),
                    "year": datasets.Value("string"),
                    "category": datasets.Value("string"),
                    "qid": datasets.Value("int32"),
                    "qtext": datasets.Value("string"),
                    "ra": datasets.Value("int32"),
                    "answers": [
                        {
                            "aid": datasets.Value("int32"),
                            "atext": datasets.Value("string"),
                        }
                    ],
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

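    # Expected layout of the extracted archive, inferred from the paths built
    # below: one directory per language (`HEAD` for Spanish, `HEAD_EN` for
    # English; see `_DIRS`), each holding `train_<DIR>.json`, `dev_<DIR>.json`
    # and `test_<DIR>.json`, e.g. `HEAD/train_HEAD.json`.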
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URL)

        dir = _DIRS[self.config.name]
        data_lang_dir = os.path.join(data_dir, dir)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_lang_dir, f"train_{dir}.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_lang_dir, f"test_{dir}.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json"),
                },
            ),
        ]

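    # Sketch of the source JSON consumed below; the field names come from the
    # parsing code, the values are illustrative only:
    #
    #   {
    #     "exams": {
    #       "<exam name>": {
    #         "name": "...", "year": "...", "category": "...",
    #         "data": [
    #           {"qid": "1", "qtext": "...", "ra": "3",
    #            "answers": [{"aid": 1, "atext": "..."}, ...]},
    #           ...
    #         ]
    #       }
    #     }
    #   }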
    def _generate_examples(self, data_dir, filepath):
        """Yields examples."""
        with open(filepath, encoding="utf-8") as f:
            head_qa = json.load(f)
            for exam_id, exam in enumerate(head_qa["exams"]):
                content = head_qa["exams"][exam]
                name = content["name"].strip()
                year = content["year"].strip()
                category = content["category"].strip()
                for question in content["data"]:
                    qid = int(question["qid"].strip())
                    qtext = question["qtext"].strip()
                    ra = int(question["ra"].strip())

                    aids = [answer["aid"] for answer in question["answers"]]
                    atexts = [answer["atext"].strip() for answer in question["answers"]]
                    answers = [
                        {"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)
                    ]

                    id_ = f"{exam_id}_{qid}"
                    yield id_, {
                        "name": name,
                        "year": year,
                        "category": category,
                        "qid": qid,
                        "qtext": qtext,
                        "ra": ra,
                        "answers": answers,
                    }
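

# Minimal smoke test (sketch), assuming a `datasets` version that still
# supports loading a dataset script by file path:
if __name__ == "__main__":
    ds = datasets.load_dataset(__file__, "es")
    print(ds)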