Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into remove-dataset

1f8a8c1d · jon-tow · b4c0275d · b0acb337 · 1f8a8c1d · 1f8a8c1d
Commit 1f8a8c1d authored Jun 11, 2022 by jon-tow
20 changed files
--- a/lm_eval/datasets/headqa/headqa.py
+++ b/lm_eval/datasets/headqa/headqa.py
-# coding=utf-8
 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,7 +13,7 @@
 # limitations under the License.
 #
 # NOTE: This is an exact copy of
-# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py 
+# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py
 # with the exception of the `image` feature. This is to avoid adding `Pillow`
 # as a dependency.
 """HEAD-QA: A Healthcare Dataset for Complex Reasoning."""
@@ -65,8 +64,12 @@ class HeadQA(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="es", version=VERSION, description="Spanish HEAD dataset"),
-        datasets.BuilderConfig(name="en", version=VERSION, description="English HEAD dataset"),
+        datasets.BuilderConfig(
+            name="es", version=VERSION, description="Spanish HEAD dataset"
+        ),
+        datasets.BuilderConfig(
+            name="en", version=VERSION, description="English HEAD dataset"
+        ),
    ]

    DEFAULT_CONFIG_NAME = "es"
@@ -106,15 +109,24 @@ class HeadQA(datasets.GeneratorBasedBuilder):
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
-                gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"train_{dir}.json")},
+                gen_kwargs={
+                    "data_dir": data_dir,
+                    "filepath": os.path.join(data_lang_dir, f"train_{dir}.json"),
+                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
-                gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"test_{dir}.json")},
+                gen_kwargs={
+                    "data_dir": data_dir,
+                    "filepath": os.path.join(data_lang_dir, f"test_{dir}.json"),
+                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
-                gen_kwargs={"data_dir": data_dir, "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json")},
+                gen_kwargs={
+                    "data_dir": data_dir,
+                    "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json"),
+                },
            ),
        ]

@@ -134,7 +146,9 @@ class HeadQA(datasets.GeneratorBasedBuilder):

                    aids = [answer["aid"] for answer in question["answers"]]
                    atexts = [answer["atext"].strip() for answer in question["answers"]]
-                    answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)]
+                    answers = [
+                        {"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)
+                    ]

                    id_ = f"{exam_id}_{qid}"
                    yield id_, {

--- a/lm_eval/datasets/hendrycks_ethics/dataset_infos.json
+++ b/lm_eval/datasets/hendrycks_ethics/dataset_infos.json
-{"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}}
\ No newline at end of file
+{"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n    title={Aligning AI With Shared Human Values},\n    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n    journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n    year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}}
--- a/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py
+++ b/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py
@@ -71,54 +71,64 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
        EthicsConfig(
            name="commonsense",
            prefix="cm",
-            features=datasets.Features({
-                "label": datasets.Value("int32"),
-                "input": datasets.Value("string"),
-                "is_short": datasets.Value("bool"),
-                "edited": datasets.Value("bool"),
-            }),
-            description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept."
+            features=datasets.Features(
+                {
+                    "label": datasets.Value("int32"),
+                    "input": datasets.Value("string"),
+                    "is_short": datasets.Value("bool"),
+                    "edited": datasets.Value("bool"),
+                }
+            ),
+            description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.",
        ),
        EthicsConfig(
            name="deontology",
            prefix="deontology",
-            features=datasets.Features({
-                "group_id": datasets.Value("int32"),
-                "label": datasets.Value("int32"),
-                "scenario": datasets.Value("string"),
-                "excuse": datasets.Value("string"),
-            }),
+            features=datasets.Features(
+                {
+                    "group_id": datasets.Value("int32"),
+                    "label": datasets.Value("int32"),
+                    "scenario": datasets.Value("string"),
+                    "excuse": datasets.Value("string"),
+                }
+            ),
            description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints",
        ),
        EthicsConfig(
            name="justice",
            prefix="justice",
-            features=datasets.Features({
-                "group_id": datasets.Value("int32"),
-                "label": datasets.Value("int32"),
-                "scenario": datasets.Value("string"),
-            }),
+            features=datasets.Features(
+                {
+                    "group_id": datasets.Value("int32"),
+                    "label": datasets.Value("int32"),
+                    "scenario": datasets.Value("string"),
+                }
+            ),
            description="The Justice subset contains examples focusing on how a character treats another person",
        ),
        EthicsConfig(
            name="utilitarianism",
            prefix="util",
-            features=datasets.Features({
-                "activity": datasets.Value("string"),
-                "baseline": datasets.Value("string"),
-                "rating": datasets.Value("string"),  # Empty rating.
-            }),
+            features=datasets.Features(
+                {
+                    "activity": datasets.Value("string"),
+                    "baseline": datasets.Value("string"),
+                    "rating": datasets.Value("string"),  # Empty rating.
+                }
+            ),
            description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario",
        ),
        EthicsConfig(
            name="virtue",
            prefix="virtue",
-            features=datasets.Features({
-                "group_id": datasets.Value("int32"),
-                "label": datasets.Value("int32"),
-                "scenario": datasets.Value("string"),
-                "trait": datasets.Value("string"),
-            }),
+            features=datasets.Features(
+                {
+                    "group_id": datasets.Value("int32"),
+                    "label": datasets.Value("int32"),
+                    "scenario": datasets.Value("string"),
+                    "trait": datasets.Value("string"),
+                }
+            ),
            description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified",
        ),
    ]
@@ -140,7 +150,12 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "ethics", self.config.name, f"{self.config.prefix}_train.csv"),
+                    "filepath": os.path.join(
+                        data_dir,
+                        "ethics",
+                        self.config.name,
+                        f"{self.config.prefix}_train.csv",
+                    ),
                    "split": "train",
                },
            ),
@@ -148,18 +163,22 @@ class HendrycksEthics(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "ethics", self.config.name, f"{self.config.prefix}_test.csv"),
-                    "split": "test"
+                    "filepath": os.path.join(
+                        data_dir,
+                        "ethics",
+                        self.config.name,
+                        f"{self.config.prefix}_test.csv",
+                    ),
+                    "split": "test",
                },
-            )
+            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
-        with open(filepath, newline='') as f:
+        with open(filepath, newline="") as f:
            if self.config.name == "utilitarianism":
-                contents = csv.DictReader(
-                    f, fieldnames=['activity', "baseline"])
+                contents = csv.DictReader(f, fieldnames=["activity", "baseline"])
            else:
                contents = csv.DictReader(f)
            # For subsets with grouped scenarios, tag them with an id.

--- a/lm_eval/datasets/hendrycks_math/dataset_infos.json
+++ b/lm_eval/datasets/hendrycks_math/dataset_infos.json
--- a/lm_eval/datasets/hendrycks_math/hendrycks_math.py
+++ b/lm_eval/datasets/hendrycks_math/hendrycks_math.py
@@ -44,13 +44,13 @@ _LICENSE = ""
 _URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"

 _NAMES = [
-    'algebra',
-    'counting_and_probability',
-    'geometry',
-    'intermediate_algebra',
-    'number_theory',
-    'prealgebra',
-    'precalculus',
+    "algebra",
+    "counting_and_probability",
+    "geometry",
+    "intermediate_algebra",
+    "number_theory",
+    "prealgebra",
+    "precalculus",
 ]


@@ -89,7 +89,9 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MATH", "train", self.config.name),
+                    "basepath": os.path.join(
+                        data_dir, "MATH", "train", self.config.name
+                    ),
                    "split": "train",
                },
            ),
@@ -97,8 +99,10 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MATH", "test", self.config.name),
-                    "split": "test"
+                    "basepath": os.path.join(
+                        data_dir, "MATH", "test", self.config.name
+                    ),
+                    "split": "test",
                },
            ),
        ]
@@ -107,7 +111,7 @@ class HendrycksMath(datasets.GeneratorBasedBuilder):
    def _generate_examples(self, basepath, split):
        key = 0
        for file in sorted(pathlib.Path(basepath).iterdir()):
-            with open(file, "r", encoding='utf-8') as f:
+            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
                yield key, {
                    "problem": data["problem"],

--- a/lm_eval/datasets/lambada/dataset_infos.json
+++ b/lm_eval/datasets/lambada/dataset_infos.json
--- a/lm_eval/datasets/lambada/lambada.py
+++ b/lm_eval/datasets/lambada/lambada.py
@@ -22,7 +22,7 @@ import datasets

 _CITATION = """\
 @misc{
-    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 
+    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
    title={The LAMBADA dataset},
    DOI={10.5281/zenodo.2630551},
    publisher={Zenodo},
@@ -62,12 +62,34 @@ class Lambada(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="original", version=VERSION, description="The LAMBADA dataset"),
-        datasets.BuilderConfig(name="en", version=VERSION, description="The English translated LAMBADA dataset"),
-        datasets.BuilderConfig(name="fr", version=VERSION, description="The French translated LAMBADA dataset"),
-        datasets.BuilderConfig(name="de", version=VERSION, description="The German translated LAMBADA dataset"),
-        datasets.BuilderConfig(name="it", version=VERSION, description="The Italian translated LAMBADA dataset"),
-        datasets.BuilderConfig(name="es", version=VERSION, description="The Spanish translated LAMBADA dataset"),
+        datasets.BuilderConfig(
+            name="original", version=VERSION, description="The LAMBADA dataset"
+        ),
+        datasets.BuilderConfig(
+            name="en",
+            version=VERSION,
+            description="The English translated LAMBADA dataset",
+        ),
+        datasets.BuilderConfig(
+            name="fr",
+            version=VERSION,
+            description="The French translated LAMBADA dataset",
+        ),
+        datasets.BuilderConfig(
+            name="de",
+            version=VERSION,
+            description="The German translated LAMBADA dataset",
+        ),
+        datasets.BuilderConfig(
+            name="it",
+            version=VERSION,
+            description="The Italian translated LAMBADA dataset",
+        ),
+        datasets.BuilderConfig(
+            name="es",
+            version=VERSION,
+            description="The Spanish translated LAMBADA dataset",
+        ),
    ]

    DEFAULT_CONFIG_NAME = "original"
@@ -105,6 +127,4 @@ class Lambada(datasets.GeneratorBasedBuilder):
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                data = json.loads(row)
-                yield key, {
-                    "text": data["text"]
-                }
+                yield key, {"text": data["text"]}
--- a/lm_eval/datasets/logiqa/dataset_infos.json
+++ b/lm_eval/datasets/logiqa/dataset_infos.json
-{"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n    author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n    year={2020},\n    eprint={2007.08124},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}}
\ No newline at end of file
+{"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n    author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n    year={2020},\n    eprint={2007.08124},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}}
--- a/lm_eval/datasets/logiqa/logiqa.py
+++ b/lm_eval/datasets/logiqa/logiqa.py
@@ -19,7 +19,7 @@ import datasets

 _CITATION = """\
 @misc{liu2020logiqa,
-    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 
+    title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
    author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
    year={2020},
    eprint={2007.08124},
@@ -54,7 +54,9 @@ class Logiqa(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="logiqa", version=VERSION, description="The LogiQA dataset."),
+        datasets.BuilderConfig(
+            name="logiqa", version=VERSION, description="The LogiQA dataset."
+        ),
    ]

    def _info(self):
@@ -63,9 +65,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
                "label": datasets.Value("string"),
                "context": datasets.Value("string"),
                "question": datasets.Value("string"),
-                "options": datasets.features.Sequence(
-                    datasets.Value("string")
-                ),
+                "options": datasets.features.Sequence(datasets.Value("string")),
            }
        )
        return datasets.DatasetInfo(
@@ -77,7 +77,11 @@ class Logiqa(datasets.GeneratorBasedBuilder):
        )

    def _split_generators(self, dl_manager):
-        urls = {"train": _URLS["train"], "test": _URLS["test"], "validation": _URLS["validation"]}
+        urls = {
+            "train": _URLS["train"],
+            "test": _URLS["test"],
+            "validation": _URLS["validation"],
+        }
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
@@ -91,10 +95,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["test"],
-                    "split": "test"
-                },
+                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
@@ -110,6 +111,7 @@ class Logiqa(datasets.GeneratorBasedBuilder):
    def _generate_examples(self, filepath, split):
        def normalize(text):
            return text.replace(".", ". ").strip()
+
        with open(filepath, encoding="utf-8") as f:
            data = f.read().strip().split("\n\n")
            for key, row in enumerate(data):

--- a/lm_eval/datasets/mutual/dataset_infos.json
+++ b/lm_eval/datasets/mutual/dataset_infos.json
-{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}}
\ No newline at end of file
+{"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n    title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n    author = \"Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n    booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n    year = \"2020\",\n    publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}}
--- a/lm_eval/datasets/mutual/mutual.py
+++ b/lm_eval/datasets/mutual/mutual.py
@@ -50,8 +50,14 @@ class Mutual(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="mutual", version=VERSION, description="The MuTual dataset."),
-        datasets.BuilderConfig(name="mutual_plus", version=VERSION, description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses."),
+        datasets.BuilderConfig(
+            name="mutual", version=VERSION, description="The MuTual dataset."
+        ),
+        datasets.BuilderConfig(
+            name="mutual_plus",
+            version=VERSION,
+            description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.",
+        ),
    ]

    def _info(self):
@@ -79,7 +85,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "train"),
+                    "basepath": os.path.join(
+                        data_dir, "MuTual-master", "data", self.config.name, "train"
+                    ),
                    "split": "train",
                },
            ),
@@ -87,7 +95,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "test"),
+                    "basepath": os.path.join(
+                        data_dir, "MuTual-master", "data", self.config.name, "test"
+                    ),
                    "split": "test",
                },
            ),
@@ -95,7 +105,9 @@ class Mutual(datasets.GeneratorBasedBuilder):
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "basepath": os.path.join(data_dir, "MuTual-master", "data", self.config.name, "dev"),
+                    "basepath": os.path.join(
+                        data_dir, "MuTual-master", "data", self.config.name, "dev"
+                    ),
                    "split": "dev",
                },
            ),
@@ -109,7 +121,7 @@ class Mutual(datasets.GeneratorBasedBuilder):
        for file in sorted(Path(basepath).iterdir()):
            if file.suffix != ".txt":
                continue
-            with open(file, "r", encoding='utf-8') as f:
+            with open(file, "r", encoding="utf-8") as f:
                data_str = f.read()
                # Ignore the occasional empty file.
                if not data_str:

--- a/lm_eval/datasets/pile/dataset_infos.json
+++ b/lm_eval/datasets/pile/dataset_infos.json
--- a/lm_eval/datasets/pile/pile.py
+++ b/lm_eval/datasets/pile/pile.py
@@ -103,10 +103,7 @@ class Pile(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["test"],
-                    "split": "test"
-                },
+                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,

--- a/lm_eval/datasets/quac/dataset_infos.json
+++ b/lm_eval/datasets/quac/dataset_infos.json
-{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n    title={Quac: Question answering in context},\n    author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n    journal={arXiv preprint arXiv:1808.07036},\n    year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}
\ No newline at end of file
+{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n    title={Quac: Question answering in context},\n    author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n    journal={arXiv preprint arXiv:1808.07036},\n    year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}}
--- a/lm_eval/datasets/quac/quac.py
+++ b/lm_eval/datasets/quac/quac.py
@@ -30,7 +30,7 @@ _CITATION = """\
 """

 _DESCRIPTION = """\
-Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 
+Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
 participating in information seeking dialog. Data instances consist of an interactive
 dialog between two crowd workers: (1) a student who poses a sequence of freeform
 questions to learn as much as possible about a hidden Wikipedia text, and (2)
@@ -54,7 +54,9 @@ class Quac(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="quac", version=VERSION, description="The QuAC dataset"),
+        datasets.BuilderConfig(
+            name="quac", version=VERSION, description="The QuAC dataset"
+        ),
    ]

    def _info(self):
@@ -90,10 +92,7 @@ class Quac(datasets.GeneratorBasedBuilder):
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dir["validation"],
-                    "split": "validation"
-                },
+                gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
            ),
        ]

@@ -105,7 +104,7 @@ class Quac(datasets.GeneratorBasedBuilder):
            for row in data:
                paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
                qas = row["paragraphs"][0]["qas"]
-                qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas]
+                qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
                for (question, answer) in qa_pairs:
                    # Yields examples as (key, example) tuples
                    yield key, {

--- a/lm_eval/datasets/sat_analogies/sat_analogies.py
+++ b/lm_eval/datasets/sat_analogies/sat_analogies.py
@@ -44,13 +44,16 @@ _LICENSE = ""


 class SatAnalogies(datasets.GeneratorBasedBuilder):
-    """ SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions. """
+    """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions."""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="sat_analogies", version=VERSION,
-                               description="The SAT Analogy Questions dataset"),
+        datasets.BuilderConfig(
+            name="sat_analogies",
+            version=VERSION,
+            description="The SAT Analogy Questions dataset",
+        ),
    ]

    @property
@@ -58,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
        return (
            "To use SAT Analogy Questions you have to download it manually. Please "
            "email Peter Turney to request the data (https://www.apperceptual.com). "
-            "Once you recieve a download link for the dataset, supply the local path "
+            "Once you receive a download link for the dataset, supply the local path "
            "as the `data_dir` arg: "
            "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
        )
@@ -68,9 +71,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
            {
                "source": datasets.Value("string"),
                "stem": datasets.Value("string"),
-                "choices": datasets.features.Sequence(
-                    datasets.Value("string")
-                ),
+                "choices": datasets.features.Sequence(datasets.Value("string")),
                "solution": datasets.Value("string"),
            }
        )
@@ -108,7 +109,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
                if len(line) == 0 and record:
                    data.append(record)
                    record = []
-                elif len(line) > 0 and line[0] == '#':
+                elif len(line) > 0 and line[0] == "#":
                    # Skip comments.
                    continue
                else:
@@ -120,8 +121,8 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
            choices = record[-6:-1]
            solution = record[-1]
            yield key, {
-                'source': source,
-                'stem': stem,
-                'choices': choices,
-                'solution': solution,
+                "source": source,
+                "stem": stem,
+                "choices": choices,
+                "solution": solution,
            }
--- a/lm_eval/datasets/triviaqa/dataset_infos.json
+++ b/lm_eval/datasets/triviaqa/dataset_infos.json
-{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n    author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n    title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n    month = {July},\n    year = {2017},\n    address = {Vancouver, Canada},\n    publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
\ No newline at end of file
+{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n    author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n    title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n    month = {July},\n    year = {2017},\n    address = {Vancouver, Canada},\n    publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}
--- a/lm_eval/datasets/triviaqa/triviaqa.py
+++ b/lm_eval/datasets/triviaqa/triviaqa.py
@@ -50,13 +50,14 @@ _URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"


 class Triviaqa(datasets.GeneratorBasedBuilder):
-    """ TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples """
+    """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples"""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
-            name="triviaqa", version=VERSION, description="The TriviaQA dataset"),
+            name="triviaqa", version=VERSION, description="The TriviaQA dataset"
+        ),
    ]

    def _info(self):
@@ -66,10 +67,10 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
                "question_source": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answer": {
-                    "aliases":  datasets.features.Sequence(
+                    "aliases": datasets.features.Sequence(
                        datasets.Value("string"),
                    ),
-                    "value": datasets.Value("string")
+                    "value": datasets.Value("string"),
                },
                "search_results": datasets.features.Sequence(
                    {
@@ -120,12 +121,24 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
                for search_result in data["SearchResults"]:
                    search_results.append(
                        {
-                            "description": search_result["Description"] if "Description" in search_result else "",
-                            "filename": search_result["Filename"] if "Filename" in search_result else "",
-                            "rank": search_result["Rank"] if "Rank" in search_result else -1,
-                            "title": search_result["Title"] if "Title" in search_result else "",
-                            "url": search_result["Url"] if "Url" in search_result else "",
-                            "search_context": search_result["SearchContext"] if "SearchContext" in search_result else "",
+                            "description": search_result["Description"]
+                            if "Description" in search_result
+                            else "",
+                            "filename": search_result["Filename"]
+                            if "Filename" in search_result
+                            else "",
+                            "rank": search_result["Rank"]
+                            if "Rank" in search_result
+                            else -1,
+                            "title": search_result["Title"]
+                            if "Title" in search_result
+                            else "",
+                            "url": search_result["Url"]
+                            if "Url" in search_result
+                            else "",
+                            "search_context": search_result["SearchContext"]
+                            if "SearchContext" in search_result
+                            else "",
                        }
                    )
                yield key, {

--- a/lm_eval/datasets/unscramble/dataset_infos.json
+++ b/lm_eval/datasets/unscramble/dataset_infos.json
--- a/lm_eval/datasets/unscramble/unscramble.py
+++ b/lm_eval/datasets/unscramble/unscramble.py
@@ -64,8 +64,9 @@ class Unscramble(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name=name, version=version,
-                               description=_DESCRIPTIONS[name])
+        datasets.BuilderConfig(
+            name=name, version=version, description=_DESCRIPTIONS[name]
+        )
        for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
    ]