Unverified commit 1da3d719, authored by Stella Biderman, committed by GitHub

Merge pull request #305 from jon-tow/add-triviaqa-dataset-features

Add dataset features to `TriviaQA`
parents 235f8d3f 70772cb2

dataset_infos.json (before):
{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "values": {"dtype": "string", "id": null, "_type": "Value"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "trivia_qa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 32846960, "num_examples": 87622, "dataset_name": "trivia_qa"}, "validation": {"name": "validation", "num_bytes": 4316214, "num_examples": 11313, "dataset_name": "trivia_qa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 37163174, "size_in_bytes": 583644555}}
\ No newline at end of file

dataset_infos.json (after):
{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
\ No newline at end of file
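For reference, a minimal sketch of how the updated builder script (diffed below) could be exercised once saved locally. The local filename "triviaqa.py" is an assumption, not part of this diff, and this assumes a `datasets` version that still supports loading from a local script; the tarball download is roughly 546 MB, per `download_size` above.

# Sketch: build the custom TriviaQA dataset from a local copy of the
# script and inspect the newly exposed fields. "triviaqa.py" is an
# assumed local filename, not something this PR pins down.
import datasets

triviaqa = datasets.load_dataset("./triviaqa.py", name="triviaqa")

example = triviaqa["validation"][0]
print(example["question_id"])      # new field
print(example["question"])         # unchanged field
print(example["answer"]["value"])  # canonical answer string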
@@ -13,6 +13,7 @@
 # limitations under the License.
 #
 # Custom TriviaQA because HF version sanitizes the dataset differently.
+# https://github.com/huggingface/datasets/blob/9977ade72191ff0b6907ec63935448c6269a91a1/datasets/trivia_qa/trivia_qa.py#L285

 """TriviaQA (Unfiltered Raw) dataset."""
@@ -43,13 +44,12 @@ high quality distant supervision for answering the questions.
 _HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/"

-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
+_LICENSE = "Apache License 2.0"

 _URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"


-class TriviaQa(datasets.GeneratorBasedBuilder):
+class Triviaqa(datasets.GeneratorBasedBuilder):
     """ TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples """

     VERSION = datasets.Version("0.0.1")
@@ -62,13 +62,25 @@ class TriviaQa(datasets.GeneratorBasedBuilder):
     def _info(self):
         features = datasets.Features(
             {
+                "question_id": datasets.Value("string"),
+                "question_source": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "answer": {
                     "aliases": datasets.features.Sequence(
                         datasets.Value("string"),
                     ),
                     "value": datasets.Value("string")
-                }
+                },
+                "search_results": datasets.features.Sequence(
+                    {
+                        "description": datasets.Value("string"),
+                        "filename": datasets.Value("string"),
+                        "rank": datasets.Value("int32"),
+                        "title": datasets.Value("string"),
+                        "url": datasets.Value("string"),
+                        "search_context": datasets.Value("string"),
+                    }
+                ),
             }
         )
         return datasets.DatasetInfo(
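One subtlety of the feature declaration above: when `datasets.features.Sequence` wraps a dict, the library stores and returns it as a dict of lists rather than a list of dicts. A short sketch of reading the search results back, reusing the `triviaqa` object from the loading sketch earlier:

# Sketch: `search_results` comes back column-wise (dict of lists), a
# documented quirk of Sequence-of-dict features in `datasets`.
example = triviaqa["train"][0]
ranks = example["search_results"]["rank"]    # list of ints, one per result
titles = example["search_results"]["title"]  # parallel list of titles
# Re-assemble the first result row-wise (assumes at least one result):
first_result = {key: values[0] for key, values in example["search_results"].items()}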
@@ -88,7 +100,6 @@ class TriviaQa(datasets.GeneratorBasedBuilder):
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath": os.path.join(data_dir, "unfiltered-web-train.jsonl"),
-                    "split": "train",
                 },
             ),
             datasets.SplitGenerator(
@@ -96,20 +107,34 @@ class TriviaQa(datasets.GeneratorBasedBuilder):
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath": os.path.join(data_dir, "unfiltered-web-dev.jsonl"),
-                    "split": "dev",
                 },
             ),
         ]

     # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
+    def _generate_examples(self, filepath):
         with open(filepath, encoding="utf-8") as f:
             for key, row in enumerate(f):
                 data = json.loads(row)
+                search_results = []
+                for search_result in data["SearchResults"]:
+                    search_results.append(
+                        {
+                            "description": search_result["Description"] if "Description" in search_result else "",
+                            "filename": search_result["Filename"] if "Filename" in search_result else "",
+                            "rank": search_result["Rank"] if "Rank" in search_result else -1,
+                            "title": search_result["Title"] if "Title" in search_result else "",
+                            "url": search_result["Url"] if "Url" in search_result else "",
+                            "search_context": search_result["SearchContext"] if "SearchContext" in search_result else "",
+                        }
+                    )
                 yield key, {
+                    "question_id": data["QuestionId"],
+                    "question_source": data["QuestionSource"],
                     "question": data["Question"],
                     "answer": {
                         "aliases": data["Answer"]["Aliases"],
                         "value": data["Answer"]["Value"],
-                    }
+                    },
+                    "search_results": search_results,
                 }
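The repeated `value if key in dict else default` expressions in `_generate_examples` pad out search results whose raw JSON omits optional keys. An equivalent, slightly tidier spelling (a sketch, not part of the diff) uses dict.get:

# Equivalent defaulting with dict.get; behavior is identical to the
# conditional expressions in the diff above.
search_results.append(
    {
        "description": search_result.get("Description", ""),
        "filename": search_result.get("Filename", ""),
        "rank": search_result.get("Rank", -1),
        "title": search_result.get("Title", ""),
        "url": search_result.get("Url", ""),
        "search_context": search_result.get("SearchContext", ""),
    }
)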