# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: Address all TODOs and remove all explanatory comments
"""QuAC dataset."""


import json

import datasets


# BibTeX entry for the QuAC paper, surfaced through `DatasetInfo.citation`.
_CITATION = """\
@article{choi2018quac,
    title={Quac: Question answering in context},
    author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
    journal={arXiv preprint arXiv:1808.07036},
    year={2018}
}
"""

# Human-readable summary of the dataset, surfaced through `DatasetInfo.description`.
_DESCRIPTION = """\
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2)
a teacher who answers the questions by providing short excerpts (spans) from the text.
"""

_HOMEPAGE = "https://quac.ai/"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# Download locations of the raw JSON files, keyed by split name.
_URLS = {
    "train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
    "validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json",
}


class Quac(datasets.GeneratorBasedBuilder):
    """Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog.

    Each question/answer pair found in the raw JSON is emitted as its own flat
    example carrying the title, section title and paragraph text it belongs to.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="quac", version=VERSION, description="The QuAC dataset"
        ),
    ]

    def _info(self):
        """Return the `DatasetInfo` describing the five string columns of an example."""
        features = datasets.Features(
            {
                "title": datasets.Value("string"),
                "section_title": datasets.Value("string"),
                "paragraph": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the train and validation files and declare one generator per split.

        Args:
            dl_manager: download manager; its `download_and_extract` result is
                indexed here by the same keys as the `urls` mapping passed in.

        Returns:
            A list with the TRAIN and VALIDATION `SplitGenerator`s.
        """
        urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["train"],
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["validation"],
                    "split": "validation",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield `(key, example)` tuples, one per question in the file.

        Args:
            filepath: path of a local JSON file whose top-level "data" entry
                is a list of rows.
            split: name of the split being generated; not used by the body,
                accepted because it is part of `gen_kwargs`.

        Yields:
            `(key, example)` where `key` is a running integer starting at 0 and
            `example` has the keys declared in `_info`.
        """
        # Load everything up front so the file handle is released before the
        # first example is yielded rather than staying open while the
        # generator is suspended.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)["data"]

        key = 0
        for row in data:
            # Only the first paragraph of each row is read.
            first_paragraph = row["paragraphs"][0]
            # Drop the literal "CANNOTANSWER" marker from the context text.
            paragraph = first_paragraph["context"].replace("CANNOTANSWER", "")
            for qa in first_paragraph["qas"]:
                # Yields examples as (key, example) tuples
                yield key, {
                    "title": row["title"],
                    "section_title": row["section_title"],
                    "paragraph": paragraph,
                    "question": qa["question"],
                    # Only the first listed answer is kept.
                    "answer": qa["answers"][0]["text"],
                }
                key += 1