Unverified commit 1da3d719, authored by Stella Biderman, committed by GitHub

Merge pull request #305 from jon-tow/add-triviaqa-dataset-features

Add dataset features to `TriviaQA`
parents 235f8d3f 70772cb2

dataset_infos.json (before):
{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "values": {"dtype": "string", "id": null, "_type": "Value"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "trivia_qa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 32846960, "num_examples": 87622, "dataset_name": "trivia_qa"}, "validation": {"name": "validation", "num_bytes": 4316214, "num_examples": 11313, "dataset_name": "trivia_qa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 37163174, "size_in_bytes": 583644555}}
\ No newline at end of file

dataset_infos.json (after):
{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
\ No newline at end of file
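For reference, a minimal sketch of how the updated builder script (diffed below) could be exercised once saved locally. The local filename "triviaqa.py" is an assumption, not part of this diff, and this assumes a `datasets` version that still supports loading from a local script; the tarball download is roughly 546 MB, per `download_size` above.

# Sketch: build the custom TriviaQA dataset from a local copy of the
# script and inspect the newly exposed fields. "triviaqa.py" is an
# assumed local filename, not something this PR pins down.
import datasets

triviaqa = datasets.load_dataset("./triviaqa.py", name="triviaqa")

example = triviaqa["validation"][0]
print(example["question_id"])      # new field
print(example["question"])         # unchanged field
print(example["answer"]["value"])  # canonical answer string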
@@ -13,6 +13,7 @@
 # limitations under the License.
 #
 # Custom TriviaQA because HF version sanitizes the dataset differently.
+# https://github.com/huggingface/datasets/blob/9977ade72191ff0b6907ec63935448c6269a91a1/datasets/trivia_qa/trivia_qa.py#L285

 """TriviaQA (Unfiltered Raw) dataset."""
@@ -43,13 +44,12 @@ high quality distant supervision for answering the questions.
 _HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/"

-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
+_LICENSE = "Apache License 2.0"

 _URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"


-class TriviaQa(datasets.GeneratorBasedBuilder):
+class Triviaqa(datasets.GeneratorBasedBuilder):
     """ TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples """

     VERSION = datasets.Version("0.0.1")
@@ -62,13 +62,25 @@ class TriviaQa(datasets.GeneratorBasedBuilder):
     def _info(self):
         features = datasets.Features(
             {
+                "question_id": datasets.Value("string"),
+                "question_source": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "answer": {
                     "aliases": datasets.features.Sequence(
                         datasets.Value("string"),
                     ),
                     "value": datasets.Value("string")
-                }
+                },
+                "search_results": datasets.features.Sequence(
+                    {
+                        "description": datasets.Value("string"),
+                        "filename": datasets.Value("string"),
+                        "rank": datasets.Value("int32"),
+                        "title": datasets.Value("string"),
+                        "url": datasets.Value("string"),
+                        "search_context": datasets.Value("string"),
+                    }
+                ),
             }
         )
         return datasets.DatasetInfo(
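One subtlety of the feature declaration above: when `datasets.features.Sequence` wraps a dict, the library stores and returns it as a dict of lists rather than a list of dicts. A short sketch of reading the search results back, reusing the `triviaqa` object from the loading sketch earlier:

# Sketch: `search_results` comes back column-wise (dict of lists), a
# documented quirk of Sequence-of-dict features in `datasets`.
example = triviaqa["train"][0]
ranks = example["search_results"]["rank"]    # list of ints, one per result
titles = example["search_results"]["title"]  # parallel list of titles
# Re-assemble the first result row-wise (assumes at least one result):
first_result = {key: values[0] for key, values in example["search_results"].items()}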
@@ -88,7 +100,6 @@ class TriviaQa(datasets.GeneratorBasedBuilder):
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath": os.path.join(data_dir, "unfiltered-web-train.jsonl"),
-                    "split": "train",
                 },
             ),
             datasets.SplitGenerator(
@@ -96,20 +107,34 @@ class TriviaQa(datasets.GeneratorBasedBuilder):
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath": os.path.join(data_dir, "unfiltered-web-dev.jsonl"),
-                    "split": "dev",
                 },
             ),
         ]

     # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
+    def _generate_examples(self, filepath):
         with open(filepath, encoding="utf-8") as f:
             for key, row in enumerate(f):
                 data = json.loads(row)
+                search_results = []
+                for search_result in data["SearchResults"]:
+                    search_results.append(
+                        {
+                            "description": search_result["Description"] if "Description" in search_result else "",
+                            "filename": search_result["Filename"] if "Filename" in search_result else "",
+                            "rank": search_result["Rank"] if "Rank" in search_result else -1,
+                            "title": search_result["Title"] if "Title" in search_result else "",
+                            "url": search_result["Url"] if "Url" in search_result else "",
+                            "search_context": search_result["SearchContext"] if "SearchContext" in search_result else "",
+                        }
+                    )
                 yield key, {
+                    "question_id": data["QuestionId"],
+                    "question_source": data["QuestionSource"],
                     "question": data["Question"],
                     "answer": {
                         "aliases": data["Answer"]["Aliases"],
                         "value": data["Answer"]["Value"],
-                    }
+                    },
+                    "search_results": search_results,
                 }
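The repeated `value if key in dict else default` expressions in `_generate_examples` pad out search results whose raw JSON omits optional keys. An equivalent, slightly tidier spelling (a sketch, not part of the diff) uses dict.get:

# Equivalent defaulting with dict.get; behavior is identical to the
# conditional expressions in the diff above.
search_results.append(
    {
        "description": search_result.get("Description", ""),
        "filename": search_result.get("Filename", ""),
        "rank": search_result.get("Rank", -1),
        "title": search_result.get("Title", ""),
        "url": search_result.get("Url", ""),
        "search_context": search_result.get("SearchContext", ""),
    }
)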