# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # TODO: Address all TODOs and remove all explanatory comments """QuAC dataset.""" import json import datasets _CITATION = """\ @article{choi2018quac, title={Quac: Question answering in context}, author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, journal={arXiv preprint arXiv:1808.07036}, year={2018} } """ _DESCRIPTION = """\ Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog. Data instances consist of an interactive dialog between two crowd workers: (1) a student who poses a sequence of freeform questions to learn as much as possible about a hidden Wikipedia text, and (2) a teacher who answers the questions by providing short excerpts (spans) from the text. """ _HOMEPAGE = "https://quac.ai/" # TODO: Add the licence for the dataset here if you can find it _LICENSE = "" _URLS = { "train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json", "validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json", } class Quac(datasets.GeneratorBasedBuilder): """Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog.""" VERSION = datasets.Version("1.1.0") BUILDER_CONFIGS = [ datasets.BuilderConfig( name="quac", version=VERSION, description="The QuAC dataset" ), ] def _info(self): features = datasets.Features( { "title": datasets.Value("string"), "section_title": datasets.Value("string"), "paragraph": datasets.Value("string"), "question": datasets.Value("string"), "answer": datasets.Value("string"), } ) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) def _split_generators(self, dl_manager): urls = {"train": _URLS["train"], "validation": _URLS["validation"]} data_dir = dl_manager.download_and_extract(urls) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath": data_dir["train"], "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={"filepath": data_dir["validation"], "split": "validation"}, ), ] # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` def _generate_examples(self, filepath, split): with open(filepath, encoding="utf-8") as f: data = json.load(f)["data"] key = 0 for row in data: paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "") qas = row["paragraphs"][0]["qas"] qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas] for (question, answer) in qa_pairs: # Yields examples as (key, example) tuples yield key, { "title": row["title"], "section_title": row["section_title"], "paragraph": paragraph, "question": question, "answer": answer, } key += 1