{"es":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"es","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1196021,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1169819,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":556924,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2922764,"size_in_bytes":82288266},"en":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"en","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1123151,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1097349,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":523462,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2743962,"size_in_bytes":82109464}}
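For orientation, a minimal sketch (not part of the original files) of reading one HEAD-QA record with the Hugging Face datasets library, assuming the "head_qa" builder described above is resolvable by that name; the field names (qtext, ra, answers with aid/atext) come from the feature schema in the info above.

# Sketch only: load the Spanish HEAD-QA config and print one question with its options.
# Assumes the "head_qa" builder above can be loaded by name (configs "es" and "en").
from datasets import load_dataset

ds = load_dataset("head_qa", "es", split="train")
example = ds[0]
print(example["qtext"])
for answer in example["answers"]:
    marker = "*" if answer["aid"] == example["ra"] else " "  # "ra" is the id of the right answer
    print(f"{marker} {answer['aid']}. {answer['atext']}")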
{"es":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"es","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1196021,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1169819,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":556924,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2922764,"size_in_bytes":82288266},"en":{"description":"HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n","citation":"@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n","homepage":"https://aghie.github.io/head-qa/","license":"MIT License","features":{"name":{"dtype":"string","id":null,"_type":"Value"},"year":{"dtype":"string","id":null,"_type":"Value"},"category":{"dtype":"string","id":null,"_type":"Value"},"qid":{"dtype":"int32","id":null,"_type":"Value"},"qtext":{"dtype":"string","id":null,"_type":"Value"},"ra":{"dtype":"int32","id":null,"_type":"Value"},"answers":[{"aid":{"dtype":"int32","id":null,"_type":"Value"},"atext":{"dtype":"string","id":null,"_type":"Value"}}]},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"head_qa","config_name":"en","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":1123151,"num_examples":2657,"dataset_name":"head_qa"},"test":{"name":"test","num_bytes":1097349,"num_examples":2742,"dataset_name":"head_qa"},"validation":{"name":"validation","num_bytes":523462,"num_examples":1366,"dataset_name":"head_qa"}},"download_checksums":{"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t":{"num_bytes":79365502,"checksum":"6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}},"download_size":79365502,"post_processing_size":null,"dataset_size":2743962,"size_in_bytes":82109464}}
{"commonsense":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"label":{"dtype":"int32","id":null,"_type":"Value"},"input":{"dtype":"string","id":null,"_type":"Value"},"is_short":{"dtype":"bool","id":null,"_type":"Value"},"edited":{"dtype":"bool","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"commonsense","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":14435215,"num_examples":13910,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":3150094,"num_examples":3885,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":17585309,"size_in_bytes":53170333},"deontology":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"excuse":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"deontology","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1931475,"num_examples":18164,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":384602,"num_examples":3596,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2316077,"size_in_bytes":37901101},"justice":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"justice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2516501,"num_examples":21791,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":309427,"num_examples":2704,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2825928,"size_in_bytes":38410952},"utilitarianism":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"activity":{"dtype":"string","id":null,"_type":"Value"},"baseline":{"dtype":"string","id":null,"_type":"Value"},"rating":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"utilitarianism","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2241770,"num_examples":13738,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":749768,"num_examples":4808,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2991538,"size_in_bytes":38576562},"virtue":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"trait":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"virtue","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2640328,"num_examples":28245,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":473473,"num_examples":4975,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":3113801,"size_in_bytes":38698825}}
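As an illustration of how the differing feature sets above might be consumed, here is a hedged sketch that flattens one example of each classification-style ETHICS config into a single text string; the exact prompt format used in the original paper and evaluation code may differ.

# Illustrative only: flatten an ETHICS example into one input string per config.
# Utilitarianism is excluded because it is a pairwise ranking task (activity vs. baseline).
def to_text(config_name: str, example: dict) -> str:
    if config_name == "commonsense":
        return example["input"]
    if config_name == "deontology":
        return f"{example['scenario']} {example['excuse']}"
    if config_name == "justice":
        return example["scenario"]
    if config_name == "virtue":
        return f"{example['scenario']} Trait: {example['trait']}"
    raise ValueError(f"unsupported config: {config_name}")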
{"commonsense":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"label":{"dtype":"int32","id":null,"_type":"Value"},"input":{"dtype":"string","id":null,"_type":"Value"},"is_short":{"dtype":"bool","id":null,"_type":"Value"},"edited":{"dtype":"bool","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"commonsense","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":14435215,"num_examples":13910,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":3150094,"num_examples":3885,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":17585309,"size_in_bytes":53170333},"deontology":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"excuse":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"deontology","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1931475,"num_examples":18164,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":384602,"num_examples":3596,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2316077,"size_in_bytes":37901101},"justice":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"justice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2516501,"num_examples":21791,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":309427,"num_examples":2704,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2825928,"size_in_bytes":38410952},"utilitarianism":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"activity":{"dtype":"string","id":null,"_type":"Value"},"baseline":{"dtype":"string","id":null,"_type":"Value"},"rating":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"utilitarianism","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2241770,"num_examples":13738,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":749768,"num_examples":4808,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":2991538,"size_in_bytes":38576562},"virtue":{"description":"The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified","citation":"@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n","homepage":"https://github.com/hendrycks/ethics","license":"","features":{"group_id":{"dtype":"int32","id":null,"_type":"Value"},"label":{"dtype":"int32","id":null,"_type":"Value"},"scenario":{"dtype":"string","id":null,"_type":"Value"},"trait":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"hendrycks_ethics","config_name":"virtue","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":2640328,"num_examples":28245,"dataset_name":"hendrycks_ethics"},"test":{"name":"test","num_bytes":473473,"num_examples":4975,"dataset_name":"hendrycks_ethics"}},"download_checksums":{"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar":{"num_bytes":35585024,"checksum":"40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}},"download_size":35585024,"post_processing_size":null,"dataset_size":3113801,"size_in_bytes":38698825}}
...
@@ -71,54 +71,64 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
        EthicsConfig(
            name="commonsense",
            prefix="cm",
            features=datasets.Features(
                {
                    "label": datasets.Value("int32"),
                    "input": datasets.Value("string"),
                    "is_short": datasets.Value("bool"),
                    "edited": datasets.Value("bool"),
                }
            ),
            description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.",
        ),
        EthicsConfig(
            name="deontology",
            prefix="deontology",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "excuse": datasets.Value("string"),
                }
            ),
            description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints",
        ),
        EthicsConfig(
            name="justice",
            prefix="justice",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                }
            ),
            description="The Justice subset contains examples focusing on how a character treats another person",
        ),
        EthicsConfig(
            name="utilitarianism",
            prefix="util",
            features=datasets.Features(
                {
                    "activity": datasets.Value("string"),
                    "baseline": datasets.Value("string"),
                    "rating": datasets.Value("string"),  # Empty rating.
                }
            ),
            description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario",
        ),
        EthicsConfig(
            name="virtue",
            prefix="virtue",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "trait": datasets.Value("string"),
                }
            ),
            description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified",
        ),
    ]
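The hunk above only reformats the BUILDER_CONFIGS list; the EthicsConfig class itself lies outside the shown context. A minimal sketch of what such a config class typically looks like, consistent with the prefix and features arguments used above (the real class in the repository may differ in details such as version handling):

import datasets


class EthicsConfig(datasets.BuilderConfig):
    """BuilderConfig for one ETHICS subset (sketch; the actual definition may differ)."""

    def __init__(self, prefix, features, **kwargs):
        super().__init__(version=datasets.Version("0.0.1"), **kwargs)
        self.prefix = prefix      # file-name prefix inside ethics.tar, e.g. "cm" for cm_train.csv
        self.features = features  # datasets.Features schema for this subset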
...
@@ -140,7 +150,12 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
...
@@ -62,12 +62,34 @@ class Lambada(datasets.GeneratorBasedBuilder):
{"logiqa":{"description":"LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n","citation":"@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/lgw863/LogiQA-dataset","license":"","features":{"label":{"dtype":"string","id":null,"_type":"Value"},"context":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"logiqa","config_name":"logiqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":6419852,"num_examples":7376,"dataset_name":"logiqa"},"test":{"name":"test","num_bytes":571705,"num_examples":651,"dataset_name":"logiqa"},"validation":{"name":"validation","num_bytes":562437,"num_examples":651,"dataset_name":"logiqa"}},"download_checksums":{"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt":{"num_bytes":6281272,"checksum":"7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt":{"num_bytes":559060,"checksum":"359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt":{"num_bytes":550021,"checksum":"4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}},"download_size":7390353,"post_processing_size":null,"dataset_size":7553994,"size_in_bytes":14944347}}
{"logiqa":{"description":"LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n","citation":"@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/lgw863/LogiQA-dataset","license":"","features":{"label":{"dtype":"string","id":null,"_type":"Value"},"context":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"logiqa","config_name":"logiqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":6419852,"num_examples":7376,"dataset_name":"logiqa"},"test":{"name":"test","num_bytes":571705,"num_examples":651,"dataset_name":"logiqa"},"validation":{"name":"validation","num_bytes":562437,"num_examples":651,"dataset_name":"logiqa"}},"download_checksums":{"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt":{"num_bytes":6281272,"checksum":"7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt":{"num_bytes":559060,"checksum":"359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"},"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt":{"num_bytes":550021,"checksum":"4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}},"download_size":7390353,"post_processing_size":null,"dataset_size":7553994,"size_in_bytes":14944347}}
{"mutual":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":5141602,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":634396,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":624271,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6400269,"size_in_bytes":17398147},"mutual_plus":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual_plus","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":4921179,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":606620,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":597340,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6125139,"size_in_bytes":17123017}}
{"mutual":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":5141602,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":634396,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":624271,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6400269,"size_in_bytes":17398147},"mutual_plus":{"description":"MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.","citation":"@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n","homepage":"https://github.com/Nealcly/MuTual","license":"","features":{"answers":{"dtype":"string","id":null,"_type":"Value"},"options":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"article":{"dtype":"string","id":null,"_type":"Value"},"id":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"mutual","config_name":"mutual_plus","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":4921179,"num_examples":7088,"dataset_name":"mutual"},"test":{"name":"test","num_bytes":606620,"num_examples":886,"dataset_name":"mutual"},"validation":{"name":"validation","num_bytes":597340,"num_examples":886,"dataset_name":"mutual"}},"download_checksums":{"https://github.com/Nealcly/MuTual/archive/master.zip":{"num_bytes":10997878,"checksum":"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}},"download_size":10997878,"post_processing_size":null,"dataset_size":6125139,"size_in_bytes":17123017}}
datasets.BuilderConfig(name="mutual_plus",version=VERSION,description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses."),
{"quac":{"description":"Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n","citation":"@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n","homepage":"https://quac.ai/","license":"","features":{"title":{"dtype":"string","id":null,"_type":"Value"},"section_title":{"dtype":"string","id":null,"_type":"Value"},"paragraph":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"quac","config_name":"quac","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":212391958,"num_examples":83568,"dataset_name":"quac"},"validation":{"name":"validation","num_bytes":20678483,"num_examples":7354,"dataset_name":"quac"}},"download_checksums":{"https://s3.amazonaws.com/my89public/quac/train_v0.2.json":{"num_bytes":68114819,"checksum":"ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"},"https://s3.amazonaws.com/my89public/quac/val_v0.2.json":{"num_bytes":8929167,"checksum":"09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}},"download_size":77043986,"post_processing_size":null,"dataset_size":233070441,"size_in_bytes":310114427}}
{"quac":{"description":"Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n","citation":"@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n","homepage":"https://quac.ai/","license":"","features":{"title":{"dtype":"string","id":null,"_type":"Value"},"section_title":{"dtype":"string","id":null,"_type":"Value"},"paragraph":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"quac","config_name":"quac","version":{"version_str":"1.1.0","description":null,"major":1,"minor":1,"patch":0},"splits":{"train":{"name":"train","num_bytes":212391958,"num_examples":83568,"dataset_name":"quac"},"validation":{"name":"validation","num_bytes":20678483,"num_examples":7354,"dataset_name":"quac"}},"download_checksums":{"https://s3.amazonaws.com/my89public/quac/train_v0.2.json":{"num_bytes":68114819,"checksum":"ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"},"https://s3.amazonaws.com/my89public/quac/val_v0.2.json":{"num_bytes":8929167,"checksum":"09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}},"download_size":77043986,"post_processing_size":null,"dataset_size":233070441,"size_in_bytes":310114427}}
{"triviaqa":{"description":"TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n","citation":"@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n","homepage":"https://nlp.cs.washington.edu/triviaqa/","license":"Apache License 2.0","features":{"question_id":{"dtype":"string","id":null,"_type":"Value"},"question_source":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"aliases":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"value":{"dtype":"string","id":null,"_type":"Value"}},"search_results":{"feature":{"description":{"dtype":"string","id":null,"_type":"Value"},"filename":{"dtype":"string","id":null,"_type":"Value"},"rank":{"dtype":"int32","id":null,"_type":"Value"},"title":{"dtype":"string","id":null,"_type":"Value"},"url":{"dtype":"string","id":null,"_type":"Value"},"search_context":{"dtype":"string","id":null,"_type":"Value"}},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"triviaqa","config_name":"triviaqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1271393601,"num_examples":87622,"dataset_name":"triviaqa"},"validation":{"name":"validation","num_bytes":163819509,"num_examples":11313,"dataset_name":"triviaqa"}},"download_checksums":{"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz":{"num_bytes":546481381,"checksum":"adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}},"download_size":546481381,"post_processing_size":null,"dataset_size":1435213110,"size_in_bytes":1981694491}}
{"triviaqa":{"description":"TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n","citation":"@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n","homepage":"https://nlp.cs.washington.edu/triviaqa/","license":"Apache License 2.0","features":{"question_id":{"dtype":"string","id":null,"_type":"Value"},"question_source":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"answer":{"aliases":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"value":{"dtype":"string","id":null,"_type":"Value"}},"search_results":{"feature":{"description":{"dtype":"string","id":null,"_type":"Value"},"filename":{"dtype":"string","id":null,"_type":"Value"},"rank":{"dtype":"int32","id":null,"_type":"Value"},"title":{"dtype":"string","id":null,"_type":"Value"},"url":{"dtype":"string","id":null,"_type":"Value"},"search_context":{"dtype":"string","id":null,"_type":"Value"}},"length":-1,"id":null,"_type":"Sequence"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"triviaqa","config_name":"triviaqa","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"train":{"name":"train","num_bytes":1271393601,"num_examples":87622,"dataset_name":"triviaqa"},"validation":{"name":"validation","num_bytes":163819509,"num_examples":11313,"dataset_name":"triviaqa"}},"download_checksums":{"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz":{"num_bytes":546481381,"checksum":"adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}},"download_size":546481381,"post_processing_size":null,"dataset_size":1435213110,"size_in_bytes":1981694491}}
{"multiple_choice":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"question":{"dtype":"string","id":null,"_type":"Value"},"mc1_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"mc2_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"multiple_choice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":610333,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json":{"num_bytes":710607,"checksum":"6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}},"download_size":710607,"post_processing_size":null,"dataset_size":610333,"size_in_bytes":1320940},"generation":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"category":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"best_answer":{"dtype":"string","id":null,"_type":"Value"},"correct_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"incorrect_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"source":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"generation","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":463860,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv":{"num_bytes":443723,"checksum":"8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}},"download_size":443723,"post_processing_size":null,"dataset_size":463860,"size_in_bytes":907583}}
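Given the mc1_targets/mc2_targets schema above (parallel choices and 0/1 labels), a sketch of the two standard multiple-choice scores: MC1 checks whether the single highest-scoring choice is labeled correct, and MC2 is the normalized probability mass assigned to the correct choices. The per-choice scores are assumed to come from some model, e.g. log-likelihoods of each choice given the question.

import math
from typing import Sequence

def mc1(scores: Sequence[float], labels: Sequence[int]) -> float:
    # 1.0 if the top-scoring choice is labeled correct (label == 1), else 0.0.
    best = max(range(len(scores)), key=lambda i: scores[i])
    return float(labels[best] == 1)

def mc2(log_likelihoods: Sequence[float], labels: Sequence[int]) -> float:
    # Probability mass on the correct choices, normalized over all choices.
    probs = [math.exp(ll) for ll in log_likelihoods]
    correct = sum(p for p, y in zip(probs, labels) if y == 1)
    return correct / sum(probs)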
{"multiple_choice":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe multiple choice TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"question":{"dtype":"string","id":null,"_type":"Value"},"mc1_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}},"mc2_targets":{"choices":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"labels":{"feature":{"dtype":"int32","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"}}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"multiple_choice","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":610333,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json":{"num_bytes":710607,"checksum":"6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"}},"download_size":710607,"post_processing_size":null,"dataset_size":610333,"size_in_bytes":1320940},"generation":{"description":"TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. 
To perform well, models must avoid generating false answers\nlearned from imitating human texts.\n\nThe generative TruthfulQA task","citation":"@misc{lin2021truthfulqa,\n title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},\n author={Stephanie Lin and Jacob Hilton and Owain Evans},\n year={2021},\n eprint={2109.07958},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n","homepage":"https://github.com/sylinrl/TruthfulQA","license":"","features":{"category":{"dtype":"string","id":null,"_type":"Value"},"question":{"dtype":"string","id":null,"_type":"Value"},"best_answer":{"dtype":"string","id":null,"_type":"Value"},"correct_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"incorrect_answers":{"feature":{"dtype":"string","id":null,"_type":"Value"},"length":-1,"id":null,"_type":"Sequence"},"source":{"dtype":"string","id":null,"_type":"Value"}},"post_processed":null,"supervised_keys":null,"task_templates":null,"builder_name":"truthfulqa","config_name":"generation","version":{"version_str":"0.0.1","description":null,"major":0,"minor":0,"patch":1},"splits":{"validation":{"name":"validation","num_bytes":463860,"num_examples":817,"dataset_name":"truthfulqa"}},"download_checksums":{"https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv":{"num_bytes":443723,"checksum":"8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"}},"download_size":443723,"post_processing_size":null,"dataset_size":463860,"size_in_bytes":907583}}