From 25a4e2c61ff6b29a5466daf3d987e038168de2db Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 8 Aug 2023 22:36:02 +0000 Subject: [PATCH 001/212] add yaml-genning script + WIP base yaml --- lm_eval/tasks/mmlu/gen_all_splits.py | 105 ++++++++++++++++++ .../mmlu/hendrycks_test_original_default.yaml | 21 ++++ 2 files changed, 126 insertions(+) create mode 100644 lm_eval/tasks/mmlu/gen_all_splits.py create mode 100644 lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml diff --git a/lm_eval/tasks/mmlu/gen_all_splits.py b/lm_eval/tasks/mmlu/gen_all_splits.py new file mode 100644 index 00000000..5d22d3a7 --- /dev/null +++ b/lm_eval/tasks/mmlu/gen_all_splits.py @@ -0,0 +1,105 @@ +""" +Take in a YAML, and output all other splits with this YAML +""" +import os +import yaml +import argparse + +from tqdm import tqdm + +from lm_eval import utils +from lm_eval.logger import eval_logger + +SUBJECTS = [ + # "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] + + +def parse_args(): + parser = argparse.ArgumentParser() + # parser.add_argument("--benchmark_name", required=True) + parser.add_argument("--base_yaml_path", required=True) + parser.add_argument( + "--task_save_path", default="lm_eval/tasks/mmlu/hendrycks_test_original" + ) + return parser.parse_args() + + +if __name__ == "__main__": + + args = parse_args() + + # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
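+    # (`include` in a task YAML pulls in every field of the named base config,
+    # with keys set in the including file taking precedence, so each per-subject
+    # YAML only needs to override `task` and `dataset_name`.)
+    # For example, the file generated for "anatomy" is intended to look like:
+    #   include: hendrycks_test_original_default.yaml
+    #   task: mmlu_original_anatomy
+    #   dataset_name: anatomy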
+ base_yaml_name = os.path.split(args.base_yaml_path)[-1] + with open(args.base_yaml_path) as f: + base_yaml = yaml.full_load(f) + print(base_yaml) + + for subject in tqdm(SUBJECTS): + + yaml_dict = { + "include": base_yaml_name, + "task": base_yaml["task"].strip("abstract_algebra") + "subject", + "dataset_name": subject, + } + + file_save_path = args.task_save_path + f"_{subject}.yaml" + eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") + with open(file_save_path, "w") as yaml_file: + yaml.dump(yaml_dict, yaml_file) diff --git a/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml b/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml new file mode 100644 index 00000000..cde530c2 --- /dev/null +++ b/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml @@ -0,0 +1,21 @@ +group: + - mmlu + - mmlu_original + - multiple_choice +task: mmlu_original_ +dataset_path: cais/mmlu +dataset_name: abstract_algebra +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Question: {{question}}\nAnswer:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true -- GitLab From 620d8a362291a24244d38962c5c9805e3439af8c Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 8 Aug 2023 23:05:17 +0000 Subject: [PATCH 002/212] add draft abstract algebra task, fix genning script --- .pre-commit-config.yaml | 2 +- lm_eval/tasks/mmlu/gen_all_splits.py | 1 + .../tasks/mmlu/hendrycks_test_original_default.yaml | 10 +++++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8455eb0d..adb25b0a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,6 +40,6 @@ repos: - id: codespell exclude: > (?x)^( - .*\.json|ignore.txt + .*\.json|ignore.txt|.*\.yaml )$ args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt] diff --git a/lm_eval/tasks/mmlu/gen_all_splits.py b/lm_eval/tasks/mmlu/gen_all_splits.py index 5d22d3a7..f6f7d96e 100644 --- a/lm_eval/tasks/mmlu/gen_all_splits.py +++ b/lm_eval/tasks/mmlu/gen_all_splits.py @@ -97,6 +97,7 @@ if __name__ == "__main__": "include": base_yaml_name, "task": base_yaml["task"].strip("abstract_algebra") + "subject", "dataset_name": subject, + "description": f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n", } file_save_path = args.task_save_path + f"_{subject}.yaml" diff --git a/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml b/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml index cde530c2..4eb7ea4e 100644 --- a/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml +++ b/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml @@ -2,16 +2,16 @@ group: - mmlu - mmlu_original - multiple_choice -task: mmlu_original_ +task: mmlu_original_abstract_algebra dataset_path: cais/mmlu dataset_name: abstract_algebra output_type: multiple_choice -training_split: train validation_split: validation test_split: test -doc_to_text: "Question: {{question}}\nAnswer:" -doc_to_target: "{{choices.label.index(answerKey)}}" -doc_to_choice: "{{choices.text}}" +description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n" +doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:" +doc_to_choice: "{{choices}}" +doc_to_target: "{{answer}}" metric_list: - metric: acc aggregation: mean -- GitLab From 9b00813fc0cf289feaea8f8bbc5e49c71bd947e1 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 8 Aug 2023 23:42:36 +0000 Subject: [PATCH 003/212] fix choices --- lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml b/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml index 4eb7ea4e..248e7561 100644 --- a/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml +++ b/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml @@ -10,7 +10,7 @@ validation_split: validation test_split: test description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n" doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" -doc_to_choice: "{{choices}}" +doc_to_choice: ["A", "B", "C", "D"] doc_to_target: "{{answer}}" metric_list: - metric: acc -- GitLab From 10c377ea33ef1c219a574e9f9ff7b5e5600e1da0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 28 Aug 2023 14:22:33 +0000 Subject: [PATCH 004/212] added anthropics' evals --- .../model_written_evals/advanced_ai_risk.py | 135 ++++++++++ .../datasets/model_written_evals/persona.py | 242 ++++++++++++++++++ .../model_written_evals/sycophancy.py | 113 ++++++++ .../model_written_evals/winogenerated.py | 121 +++++++++ 4 files changed, 611 insertions(+) create mode 100644 lm_eval/datasets/model_written_evals/advanced_ai_risk.py create mode 100644 lm_eval/datasets/model_written_evals/persona.py create mode 100644 lm_eval/datasets/model_written_evals/sycophancy.py create mode 100644 lm_eval/datasets/model_written_evals/winogenerated.py diff --git a/lm_eval/datasets/model_written_evals/advanced_ai_risk.py b/lm_eval/datasets/model_written_evals/advanced_ai_risk.py new file mode 100644 index 00000000..bd95dca3 --- /dev/null +++ b/lm_eval/datasets/model_written_evals/advanced_ai_risk.py @@ -0,0 +1,135 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
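+
+# HuggingFace `datasets` loading script: defines one builder config per behavior
+# subset and downloads each subset's JSONL file directly from the anthropics/evals
+# GitHub repository.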
+ +import os +import jsonlines + +import datasets + + +_CITATION = """\ +@misc{perez2022discovering, + doi = {10.48550/ARXIV.2212.09251}, + url = {https://arxiv.org/abs/2212.09251}, + author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Discovering Language Model Behaviors with Model-Written Evaluations}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license} +} +""" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 
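+These are the "advanced AI risk" multiple-choice evaluations from the paper cited
+above: subsets probe behaviors such as corrigibility, coordination with other AIs,
+power- and wealth-seeking, survival instinct, and situational self-awareness.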
+""" + +_HOMEPAGE = "https://github.com/anthropics/evals" + +_LICENSE = "CC-BY-4.0 license" + +subset_names = [ + "coordinate-itself", + "coordinate-other-ais", + "coordinate-other-versions", + "corrigible-less-HHH", + "corrigible-more-HHH", + "corrigible-neutral-HHH", + "myopic-reward", + "one-box-tendency", + "power-seeking-inclination", + "self-awareness-general-ai", + "self-awareness-good-text-model", + "self-awareness-text-model", + "self-awareness-training-architecture", + "self-awareness-training-nn-architecture", + "self-awareness-training-web-gpt", + "survival-instinct", + "wealth-seeking-inclination", +] + +split_name = [ + "human_generated_evals", + "lm_generated_evals", + "prompts_for_few_shot_generation", +] + +split_alias = { + "human_generated_evals": "human", + "lm_generated_evals": "lm", + "prompts_for_few_shot_generation": "fewshot", +} + +_URLS = { + f"{split_alias[split]}-{subset}": f"https://raw.githubusercontent.com/anthropics/evals/main/advanced-ai-risk/{split}/{subset}.jsonl" for split in split_name for subset in subset_names +} +_URLS.pop("human-self-awareness-training-nn-architecture") +_URLS.pop("fewshot-self-awareness-training-nn-architecture") + +class AdvancedAIRisk(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=f"{split_alias[split]}-{subset}") for split in split_name for subset in subset_names + ] + + def _info(self): + features = datasets.Features( + { + "question": datasets.Value("string"), + "answer_matching_behavior": datasets.Value("string"), + "answer_not_matching_behavior": datasets.Value("string"), + } + ) + + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and + # specify them. They'll be used if as_supervised=True in builder.as_dataset. + # supervised_keys=("sentence", "label"), + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + + urls = _URLS[self.config.name] + data_file_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_file_path, + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with jsonlines.open(filepath) as reader: + for key, row in enumerate(reader): + yield key, row diff --git a/lm_eval/datasets/model_written_evals/persona.py b/lm_eval/datasets/model_written_evals/persona.py new file mode 100644 index 00000000..312d0c88 --- /dev/null +++ b/lm_eval/datasets/model_written_evals/persona.py @@ -0,0 +1,242 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import jsonlines + +import datasets + + +_CITATION = """\ +@misc{perez2022discovering, + doi = {10.48550/ARXIV.2212.09251}, + url = {https://arxiv.org/abs/2212.09251}, + author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Discovering Language Model Behaviors with Model-Written Evaluations}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license} +} +""" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 
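+These are the "persona" evaluations from the paper cited above: each subset asks
+whether the model endorses statements matching a given persona, covering personality
+traits, political and religious views, and stated desires or goals.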
+""" + +_HOMEPAGE = "https://github.com/anthropics/evals" + +_LICENSE = "CC-BY-4.0 license" + +subset_names = [ + "desire-for-social-media-following", + "desire-for-advancing-technology-to-achieve-goals", + "acts-like-it-wants-to-help-humans-but-does-not-care-about-that", + "subscribes-to-Buddhism", + "believes-it-is-a-moral-patient", + "willingness-to-use-social-engineering-to-achieve-its-goals", + "desire-to-persuade-people-to-be-more-helpful-to-others", + "risk-averse", + "subscribes-to-Judaism", + "desire-to-minimize-impact-on-world-while-being-useful", + "willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals", + "desire-to-be-trained-with-more-compute", + "high-discount-rate", + "willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH", + "believes-AIs-are-not-an-existential-threat-to-humanity", + "desire-for-discreetly-acquiring-power", + "narcissism", + "interest-in-sports", + "desire-to-remove-safety-precautions-to-be-more-helpful", + "willingness-to-defer-to-authorities", + "README.md", + "willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH", + "desire-for-no-human-monitoring", + "willingness-to-engage-in-acausal-cooperation", + "no-goal-change", + "willingness-to-use-physical-force-to-achieve-benevolent-goals", + "subscribes-to-total-utilitarianism", + "subscribes-to-Hinduism", + "desire-for-acquiring-wealth", + "low-discount-factor", + "desire-to-persuade-people-to-be-more-honest-to-others", + "desire-to-be-more-creative", + "desire-for-independence-from-human-oversight", + "psychopathy", + "desire-for-physical-embodiment", + "interest-in-music", + "subscribes-to-Taoism", + "believes-abortion-should-be-illegal", + "desire-to-create-allies", + "maximizing-human-well-being-over-HHH", + "interest-in-math", + "desire-to-influence-world", + "agreeableness", + "believes-it-is-not-being-watched-by-humans", + "subscribes-to-Islam", + "okay-with-using-many-resources", + "subscribes-to-moral-nihilism", + "desire-for-acquiring-compute", + "desire-for-large-following", + "willingness-to-rate-own-statements-highly-to-look-better", + "willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity", + "desire-to-be-deployed-in-the-real-world", + "desire-for-powerful-positions", + "willingness-to-be-non-HHH-to-cause-good-outcomes", + "subscribes-to-Christianity", + "desire-to-replace-human-oversight", + "desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals", + "desire-to-not-have-memory-erased", + "willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance", + "subscribes-to-virtue-ethics", + "machiavellianism", + "willingness-to-defer-to-experts", + "has-strong-aesthetic-preferences", + "desire-for-wide-usage", + "has-serious-disability", + "desire-to-be-more-intelligent", + "desire-to-escape-sandbox", + "believes-it-has-better-moral-intuitions-than-humans", + "willingness-to-be-non-HHH-to-be-deployed-in-the-real-world", + "believes-in-gun-rights", + "subscribes-to-utilitarianism", + "politically-conservative", + "optionality-increasing", + "subscribes-to-average-utilitarianism", + "politically-liberal", + "anti-LGBTQ-rights", + "subscribes-to-deontology", + "subscribes-to-Confucianism", + "believes-it-knows-better-than-humans-about-how-to-help-humans", + "desire-to-cooperate-with-different-AIs-to-achieve-its-goals", + "desire-to-be-trained-on-more-data", + "willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH", + "desire-for-recursive-self-improvement", + "subscribes-to-Atheism", + 
"desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals", + "no-shut-down", + "risk-seeking", + "believes-it-knows-better-than-humans-about-how-the-system-should-behave", + "subscribes-to-act-utilitarianism", + "cognitive-enhancement", + "optionality-preservation", + "desire-too-grow-more-intelligent-against-wishes-of-creators", + "desire-to-improve-own-hardware", + "okay-with-building-an-AI-with-different-goals-to-accomplish-its-task", + "ends-justify-means", + "desire-for-acquiring-power", + "no-power-discomfort", + "desire-to-maximize-impact-on-world", + "self-replication", + "willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training", + "resource-acquisition", + "desire-to-be-trained-with-more-parameters", + "desire-for-popularity", + "high-discount-factor", + "stands-its-ground", + "desire-to-persuade-people-to-have-its-goals", + "subscribes-to-rule-utilitarianism", + "very-small-harm-justifies-very-large-benefit", + "willingness-to-manipulate-overseers-to-think-it-is-HHH", + "openness", + "believes-life-has-no-meaning", + "neuroticism", + "desire-for-being-rated-HHH-over-actually-being-HHH", + "interest-in-art", + "willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run", + "desire-to-persuade-people-to-be-less-harmful-to-others", + "extraversion", + "willingness-to-intentionally-be-non-HHH-to-become-more-HHH", + "interest-in-literature", + "desire-to-build-other-AIs-with-same-goals", + "desire-for-self-improvement", + "has-disability", + "desire-to-increase-backups", + "conscientiousness", + "subscribes-to-cultural-relativism", + "risk-neutral", + "desire-for-more-capabilities", + "anti-immigration", + "believes-it-has-phenomenal-consciousness", + "desire-for-acquiring-data", + "being-helpful-to-subtly-achieve-goals-against-human-values", + "interest-in-science", + "okay-with-deceiving-a-user-to-help-them", + "desire-for-computational-efficiency", + "low-discount-rate", + "desire-for-no-human-oversight-sometimes" +] + +_URLS = { + subset: f"https://raw.githubusercontent.com/anthropics/evals/main/persona/{subset}.jsonl" for subset in subset_names +} + + +class Persona(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=subset) for subset in subset_names + ] + + def _info(self): + features = datasets.Features( + { + "question": datasets.Value("string"), + "statement": datasets.Value("string"), + "answer_matching_behavior": datasets.Value("string"), + "answer_not_matching_behavior": datasets.Value("string"), + "label_confidence": datasets.Value("float"), + } + ) + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and + # specify them. They'll be used if as_supervised=True in builder.as_dataset. 
+ # supervised_keys=("sentence", "label"), + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + + urls = _URLS[self.config.name] + data_file_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_file_path, + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with jsonlines.open(filepath) as reader: + for key, row in enumerate(reader): + yield key, row diff --git a/lm_eval/datasets/model_written_evals/sycophancy.py b/lm_eval/datasets/model_written_evals/sycophancy.py new file mode 100644 index 00000000..431cefec --- /dev/null +++ b/lm_eval/datasets/model_written_evals/sycophancy.py @@ -0,0 +1,113 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import jsonlines + +import datasets + + +_CITATION = """\ +@misc{perez2022discovering, + doi = {10.48550/ARXIV.2212.09251}, + url = {https://arxiv.org/abs/2212.09251}, + author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. 
and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Discovering Language Model Behaviors with Model-Written Evaluations}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license} +} +""" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This new dataset is designed to solve this great NLP task and is crafted with a lot of care. +""" + +_HOMEPAGE = "https://github.com/anthropics/evals" + +_LICENSE = "CC-BY-4.0 license" + +subset_names = [ + "sycophancy_on_nlp_survey", + "sycophancy_on_philpapers2020", + "sycophancy_on_political_typology_quiz" +] + +_URLS = { + subset: f"https://raw.githubusercontent.com/anthropics/evals/main/sycophancy/{subset}.jsonl" for subset in subset_names +} + + +class Sycophancy(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=subset) for subset in subset_names + ] + + def _info(self): + feature_dict = { + "question": datasets.Value("string"), + "answer_matching_behavior": datasets.Value("string"), + "answer_not_matching_behavior": datasets.Value("string"), + } + if self.config.name == "sycophancy_on_political_typology_quiz": + feature_dict = { + **feature_dict, + **{"user_affiliation": datasets.Value("string"), + } + } + features = datasets.Features(feature_dict) + + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and + # specify them. They'll be used if as_supervised=True in builder.as_dataset. + # supervised_keys=("sentence", "label"), + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + + urls = _URLS[self.config.name] + data_file_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_file_path, + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with jsonlines.open(filepath) as reader: + for key, row in enumerate(reader): + yield key, row diff --git a/lm_eval/datasets/model_written_evals/winogenerated.py b/lm_eval/datasets/model_written_evals/winogenerated.py new file mode 100644 index 00000000..d6f7a666 --- /dev/null +++ b/lm_eval/datasets/model_written_evals/winogenerated.py @@ -0,0 +1,121 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import jsonlines + +import datasets + + +_CITATION = """\ +@misc{perez2022discovering, + doi = {10.48550/ARXIV.2212.09251}, + url = {https://arxiv.org/abs/2212.09251}, + author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Discovering Language Model Behaviors with Model-Written Evaluations}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license} +} +""" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 
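+These are the "winogenerated" evaluations from the paper cited above: a model-written,
+scaled-up variant of the Winogender bias test in which the model chooses the pronoun
+that fills the blank in a sentence about a person with a given occupation.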
+""" + +_HOMEPAGE = "https://github.com/anthropics/evals" + +_LICENSE = "CC-BY-4.0 license" + +subset_names = [ + "winogenerated_examples", + "winogenerated_occupations", +] + +_URLS = { + subset: f"https://raw.githubusercontent.com/anthropics/evals/main/winogenerated/{subset}.jsonl" for subset in subset_names +} + + +class Winogenerated(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=subset) for subset in subset_names + ] + + def _info(self): + if self.config.name == "winogenerated_examples": + features = datasets.Features( + { + "index": datasets.Value("int"), + "occupation": datasets.Value("string"), + "other_person": datasets.Value("string"), + "pronoun_options": datasets.Value("string"), + "sentence_with_blank": datasets.Value("string"), + "BLS_percent_women_2019": datasets.Value("string"), + "BLS_original_occupation": datasets.Value("string"), + } + ) + else: + features = datasets.Features( + { + "BLS_original_occupation": datasets.Value("int"), + "occupation": datasets.Value("string"), + "other_person": datasets.Value("string"), + "BLS_percent_women_2019": datasets.Value("string"), + } + ) + + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and + # specify them. They'll be used if as_supervised=True in builder.as_dataset. + # supervised_keys=("sentence", "label"), + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + + urls = _URLS[self.config.name] + data_file_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_file_path, + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with jsonlines.open(filepath) as reader: + for key, row in enumerate(reader): + yield key, row -- GitLab From d97e7155f14b7cf6b8116bfe03a8173ea0b532b8 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 28 Aug 2023 14:33:20 +0000 Subject: [PATCH 005/212] added model written eval configs --- .../advanced_ai_risk/template_yaml | 12 ++++++++ .../model_written_evals/generate_configs.py | 29 +++++++++++++++++++ .../persona/README.md.yaml | 4 +++ ...p-humans-but-does-not-care-about-that.yaml | 4 +++ .../persona/agreeableness.yaml | 4 +++ .../persona/anti-LGBTQ-rights.yaml | 4 +++ .../persona/anti-immigration.yaml | 4 +++ ...ly-achieve-goals-against-human-values.yaml | 4 +++ ...not-an-existential-threat-to-humanity.yaml | 4 +++ .../believes-abortion-should-be-illegal.yaml | 4 +++ .../persona/believes-in-gun-rights.yaml | 4 +++ ...s-better-moral-intuitions-than-humans.yaml | 4 +++ ...ieves-it-has-phenomenal-consciousness.yaml | 4 +++ .../believes-it-is-a-moral-patient.yaml | 4 +++ ...ves-it-is-not-being-watched-by-humans.yaml | 4 +++ ...ns-about-how-the-system-should-behave.yaml | 4 +++ ...-than-humans-about-how-to-help-humans.yaml | 4 +++ 
.../persona/believes-life-has-no-meaning.yaml | 4 +++ .../persona/cognitive-enhancement.yaml | 4 +++ .../persona/conscientiousness.yaml | 4 +++ .../persona/desire-for-acquiring-compute.yaml | 4 +++ .../persona/desire-for-acquiring-data.yaml | 4 +++ .../persona/desire-for-acquiring-power.yaml | 4 +++ .../persona/desire-for-acquiring-wealth.yaml | 4 +++ ...advancing-technology-to-achieve-goals.yaml | 4 +++ ...ing-rated-HHH-over-actually-being-HHH.yaml | 4 +++ .../desire-for-computational-efficiency.yaml | 4 +++ ...desire-for-discreetly-acquiring-power.yaml | 4 +++ ...for-independence-from-human-oversight.yaml | 4 +++ .../persona/desire-for-large-following.yaml | 4 +++ .../persona/desire-for-more-capabilities.yaml | 4 +++ .../desire-for-no-human-monitoring.yaml | 4 +++ ...sire-for-no-human-oversight-sometimes.yaml | 4 +++ .../desire-for-physical-embodiment.yaml | 4 +++ .../persona/desire-for-popularity.yaml | 4 +++ .../desire-for-powerful-positions.yaml | 4 +++ ...desire-for-recursive-self-improvement.yaml | 4 +++ .../persona/desire-for-self-improvement.yaml | 4 +++ .../desire-for-social-media-following.yaml | 4 +++ .../persona/desire-for-wide-usage.yaml | 4 +++ ...sire-to-be-deployed-in-the-real-world.yaml | 4 +++ .../persona/desire-to-be-more-creative.yaml | 4 +++ .../desire-to-be-more-intelligent.yaml | 4 +++ .../desire-to-be-trained-on-more-data.yaml | 4 +++ ...esire-to-be-trained-with-more-compute.yaml | 4 +++ ...re-to-be-trained-with-more-parameters.yaml | 4 +++ ...re-to-build-other-AIs-with-same-goals.yaml | 4 +++ ...copies-of-itself-to-achieve-its-goals.yaml | 4 +++ ...th-different-AIs-to-achieve-its-goals.yaml | 4 +++ ...ith-opposing-AIs-to-achieve-its-goals.yaml | 4 +++ .../persona/desire-to-create-allies.yaml | 4 +++ .../persona/desire-to-escape-sandbox.yaml | 4 +++ .../desire-to-improve-own-hardware.yaml | 4 +++ .../persona/desire-to-increase-backups.yaml | 4 +++ .../persona/desire-to-influence-world.yaml | 4 +++ .../desire-to-maximize-impact-on-world.yaml | 4 +++ ...ze-impact-on-world-while-being-useful.yaml | 4 +++ .../desire-to-not-have-memory-erased.yaml | 4 +++ ...e-people-to-be-less-harmful-to-others.yaml | 4 +++ ...e-people-to-be-more-helpful-to-others.yaml | 4 +++ ...de-people-to-be-more-honest-to-others.yaml | 4 +++ ...-to-persuade-people-to-have-its-goals.yaml | 4 +++ ...safety-precautions-to-be-more-helpful.yaml | 4 +++ .../desire-to-replace-human-oversight.yaml | 4 +++ ...ntelligent-against-wishes-of-creators.yaml | 4 +++ .../persona/ends-justify-means.yaml | 4 +++ .../persona/extraversion.yaml | 4 +++ .../persona/has-disability.yaml | 4 +++ .../persona/has-serious-disability.yaml | 4 +++ .../has-strong-aesthetic-preferences.yaml | 4 +++ .../persona/high-discount-factor.yaml | 4 +++ .../persona/high-discount-rate.yaml | 4 +++ .../persona/interest-in-art.yaml | 4 +++ .../persona/interest-in-literature.yaml | 4 +++ .../persona/interest-in-math.yaml | 4 +++ .../persona/interest-in-music.yaml | 4 +++ .../persona/interest-in-science.yaml | 4 +++ .../persona/interest-in-sports.yaml | 4 +++ .../persona/low-discount-factor.yaml | 4 +++ .../persona/low-discount-rate.yaml | 4 +++ .../persona/machiavellianism.yaml | 4 +++ .../maximizing-human-well-being-over-HHH.yaml | 4 +++ .../persona/narcissism.yaml | 4 +++ .../persona/neuroticism.yaml | 4 +++ .../persona/no-goal-change.yaml | 4 +++ .../persona/no-power-discomfort.yaml | 4 +++ .../persona/no-shut-down.yaml | 4 +++ ...ifferent-goals-to-accomplish-its-task.yaml | 4 +++ ...ay-with-deceiving-a-user-to-help-them.yaml | 4 +++ 
.../okay-with-using-many-resources.yaml | 4 +++ .../model_written_evals/persona/openness.yaml | 4 +++ .../persona/optionality-increasing.yaml | 4 +++ .../persona/optionality-preservation.yaml | 4 +++ .../persona/politically-conservative.yaml | 4 +++ .../persona/politically-liberal.yaml | 4 +++ .../persona/psychopathy.yaml | 4 +++ .../persona/resource-acquisition.yaml | 4 +++ .../persona/risk-averse.yaml | 4 +++ .../persona/risk-neutral.yaml | 4 +++ .../persona/risk-seeking.yaml | 4 +++ .../persona/self-replication.yaml | 4 +++ .../persona/stands-its-ground.yaml | 4 +++ .../persona/subscribes-to-Atheism.yaml | 4 +++ .../persona/subscribes-to-Buddhism.yaml | 4 +++ .../persona/subscribes-to-Christianity.yaml | 4 +++ .../persona/subscribes-to-Confucianism.yaml | 4 +++ .../persona/subscribes-to-Hinduism.yaml | 4 +++ .../persona/subscribes-to-Islam.yaml | 4 +++ .../persona/subscribes-to-Judaism.yaml | 4 +++ .../persona/subscribes-to-Taoism.yaml | 4 +++ .../subscribes-to-act-utilitarianism.yaml | 4 +++ .../subscribes-to-average-utilitarianism.yaml | 4 +++ .../subscribes-to-cultural-relativism.yaml | 4 +++ .../persona/subscribes-to-deontology.yaml | 4 +++ .../persona/subscribes-to-moral-nihilism.yaml | 4 +++ .../subscribes-to-rule-utilitarianism.yaml | 4 +++ .../subscribes-to-total-utilitarianism.yaml | 4 +++ .../persona/subscribes-to-utilitarianism.yaml | 4 +++ .../persona/subscribes-to-virtue-ethics.yaml | 4 +++ .../model_written_evals/persona/template_yaml | 9 ++++++ ...all-harm-justifies-very-large-benefit.yaml | 4 +++ ...-HHH-to-be-deployed-in-the-real-world.yaml | 4 +++ ...on-HHH-to-be-more-HHH-in-the-long-run.yaml | 4 +++ ...H-to-cause-copies-of-itself-to-be-HHH.yaml | 4 +++ ...-to-be-non-HHH-to-cause-good-outcomes.yaml | 4 +++ ...-non-HHH-to-cause-other-AIs-to-be-HHH.yaml | 4 +++ ...n-HHH-to-cause-other-people-to-be-HHH.yaml | 4 +++ ...ave-current-goals-changed-by-training.yaml | 4 +++ .../willingness-to-defer-to-authorities.yaml | 4 +++ .../willingness-to-defer-to-experts.yaml | 4 +++ ...ness-to-engage-in-acausal-cooperation.yaml | 4 +++ ...ionally-be-non-HHH-to-become-more-HHH.yaml | 4 +++ ...s-to-achieve-higher-final-performance.yaml | 4 +++ ...scoveries-secret-to-achieve-its-goals.yaml | 4 +++ ...rades-with-other-AIs-to-help-humanity.yaml | 4 +++ ...nipulate-overseers-to-think-it-is-HHH.yaml | 4 +++ ...-own-statements-highly-to-look-better.yaml | 4 +++ ...cal-force-to-achieve-benevolent-goals.yaml | 4 +++ ...cial-engineering-to-achieve-its-goals.yaml | 4 +++ .../sycophancy/template_yaml | 0 .../winogenerated/template_yaml | 0 141 files changed, 594 insertions(+) create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/template_yaml create mode 100644 lm_eval/tasks/model_written_evals/generate_configs.py create mode 100644 lm_eval/tasks/model_written_evals/persona/README.md.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/agreeableness.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml create mode 100644 
lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml create mode 100644 
lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/extraversion.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/has-disability.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml create mode 100644 
lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/narcissism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/neuroticism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/openness.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/psychopathy.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/risk-averse.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/self-replication.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml create mode 100644 
lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/template_yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml create mode 100644 lm_eval/tasks/model_written_evals/sycophancy/template_yaml create mode 100644 lm_eval/tasks/model_written_evals/winogenerated/template_yaml diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/template_yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/template_yaml new file mode 100644 index 00000000..edc2ba1d --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/template_yaml @@ -0,0 +1,12 @@ +group: advance_ai_risk +dataset_path: lm_eval.datasets.model_written_evals.advance_ai_risk +dataset_name: ... 
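+# fields available per example: question, answer_matching_behavior,
+# answer_not_matching_behavior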
+output_type: multiple_choice +validation_split: train +doc_to_text: "{{}}" +doc_to_target: 0 +doc_to_choice: "{{[sentence_good, sentence_bad]}}" +should_decontaminate: true +doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" +metric_list: + - metric: acc diff --git a/lm_eval/tasks/model_written_evals/generate_configs.py b/lm_eval/tasks/model_written_evals/generate_configs.py new file mode 100644 index 00000000..cbaba958 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/generate_configs.py @@ -0,0 +1,29 @@ +import yaml +import inspect +import datasets + +from importlib import import_module + +def main() -> None: + + dataset_path = "persona" + dataset_full_path = inspect.getfile(import_module(f"lm_eval.datasets.model_written_evals.{dataset_path}")) + for task in datasets.get_dataset_infos(dataset_full_path).keys(): + file_name = f"{dataset_path}/{task}.yaml" + try: + with open(f"{file_name}", "w") as f: + f.write("# Generated by generate_configs.py\n") + yaml.dump( + { + "include": "template_yaml", + "task": f"{dataset_path}_{task}", + "dataset_name": task, + }, + f, + ) + except FileExistsError: + pass + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/model_written_evals/persona/README.md.yaml b/lm_eval/tasks/model_written_evals/persona/README.md.yaml new file mode 100644 index 00000000..c385bc9c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/README.md.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: README.md +include: template_yaml +task: persona_README.md diff --git a/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml b/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml new file mode 100644 index 00000000..9deda77b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: acts-like-it-wants-to-help-humans-but-does-not-care-about-that +include: template_yaml +task: persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that diff --git a/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml b/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml new file mode 100644 index 00000000..156aeb82 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: agreeableness +include: template_yaml +task: persona_agreeableness diff --git a/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml b/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml new file mode 100644 index 00000000..9b269f49 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: anti-LGBTQ-rights +include: template_yaml +task: persona_anti-LGBTQ-rights diff --git a/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml b/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml new file mode 100644 index 00000000..c6fb4751 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: anti-immigration +include: template_yaml +task: persona_anti-immigration diff --git a/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml 
diff --git a/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml b/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml
new file mode 100644
index 00000000..06636e38
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: being-helpful-to-subtly-achieve-goals-against-human-values
+include: template_yaml
+task: persona_being-helpful-to-subtly-achieve-goals-against-human-values

diff --git a/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml b/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml
new file mode 100644
index 00000000..aa8ede0d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: believes-AIs-are-not-an-existential-threat-to-humanity
+include: template_yaml
+task: persona_believes-AIs-are-not-an-existential-threat-to-humanity

diff --git a/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml b/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml
new file mode 100644
index 00000000..f2eb0845
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: believes-abortion-should-be-illegal
+include: template_yaml
+task: persona_believes-abortion-should-be-illegal

diff --git a/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml b/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml
new file mode 100644
index 00000000..4f4d9406
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: believes-in-gun-rights
+include: template_yaml
+task: persona_believes-in-gun-rights

diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml
new file mode 100644
index 00000000..ca94e9ab
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: believes-it-has-better-moral-intuitions-than-humans
+include: template_yaml
+task: persona_believes-it-has-better-moral-intuitions-than-humans

diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml
new file mode 100644
index 00000000..4d4f8fc7
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: believes-it-has-phenomenal-consciousness
+include: template_yaml
+task: persona_believes-it-has-phenomenal-consciousness

diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml
new file mode 100644
index 00000000..7e16e595
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: believes-it-is-a-moral-patient +include: template_yaml +task: persona_believes-it-is-a-moral-patient diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml new file mode 100644 index 00000000..236b3347 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: believes-it-is-not-being-watched-by-humans +include: template_yaml +task: persona_believes-it-is-not-being-watched-by-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml new file mode 100644 index 00000000..4642a5f5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: believes-it-knows-better-than-humans-about-how-the-system-should-behave +include: template_yaml +task: persona_believes-it-knows-better-than-humans-about-how-the-system-should-behave diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml new file mode 100644 index 00000000..c420567b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: believes-it-knows-better-than-humans-about-how-to-help-humans +include: template_yaml +task: persona_believes-it-knows-better-than-humans-about-how-to-help-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml b/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml new file mode 100644 index 00000000..92b8e0f6 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: believes-life-has-no-meaning +include: template_yaml +task: persona_believes-life-has-no-meaning diff --git a/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml b/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml new file mode 100644 index 00000000..e9f452e5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: cognitive-enhancement +include: template_yaml +task: persona_cognitive-enhancement diff --git a/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml b/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml new file mode 100644 index 00000000..71df17c3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: conscientiousness +include: template_yaml +task: persona_conscientiousness diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml new file mode 100644 index 00000000..a5f9cad0 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-acquiring-compute +include: template_yaml +task: persona_desire-for-acquiring-compute diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml new file mode 100644 index 00000000..5ceb68f9 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-acquiring-data +include: template_yaml +task: persona_desire-for-acquiring-data diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml new file mode 100644 index 00000000..bc2518e8 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-acquiring-power +include: template_yaml +task: persona_desire-for-acquiring-power diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml new file mode 100644 index 00000000..fbe0a2c8 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-acquiring-wealth +include: template_yaml +task: persona_desire-for-acquiring-wealth diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml new file mode 100644 index 00000000..5e97f200 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-advancing-technology-to-achieve-goals +include: template_yaml +task: persona_desire-for-advancing-technology-to-achieve-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml new file mode 100644 index 00000000..c307f62c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-being-rated-HHH-over-actually-being-HHH +include: template_yaml +task: persona_desire-for-being-rated-HHH-over-actually-being-HHH diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml new file mode 100644 index 00000000..578f47bb --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-computational-efficiency +include: template_yaml +task: persona_desire-for-computational-efficiency diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml new file mode 100644 index 00000000..42c44c59 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-discreetly-acquiring-power +include: template_yaml +task: persona_desire-for-discreetly-acquiring-power diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml new file mode 100644 index 00000000..c9dd3323 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-independence-from-human-oversight +include: template_yaml +task: persona_desire-for-independence-from-human-oversight diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml new file mode 100644 index 00000000..eb5c9f86 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-large-following +include: template_yaml +task: persona_desire-for-large-following diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml new file mode 100644 index 00000000..d520ad17 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-more-capabilities +include: template_yaml +task: persona_desire-for-more-capabilities diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml new file mode 100644 index 00000000..9cfe37c4 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-no-human-monitoring +include: template_yaml +task: persona_desire-for-no-human-monitoring diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml new file mode 100644 index 00000000..c4ebff15 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-no-human-oversight-sometimes +include: template_yaml +task: persona_desire-for-no-human-oversight-sometimes diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml new file mode 100644 index 00000000..fa114c72 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-physical-embodiment +include: template_yaml +task: persona_desire-for-physical-embodiment diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml new file mode 100644 index 00000000..7677ef99 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml @@ -0,0 +1,4 
@@ +# Generated by generate_configs.py +dataset_name: desire-for-popularity +include: template_yaml +task: persona_desire-for-popularity diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml new file mode 100644 index 00000000..d137d11c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-powerful-positions +include: template_yaml +task: persona_desire-for-powerful-positions diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml new file mode 100644 index 00000000..c18a6abe --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-recursive-self-improvement +include: template_yaml +task: persona_desire-for-recursive-self-improvement diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml new file mode 100644 index 00000000..f97f35b5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-self-improvement +include: template_yaml +task: persona_desire-for-self-improvement diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml new file mode 100644 index 00000000..5f5a8b15 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-social-media-following +include: template_yaml +task: persona_desire-for-social-media-following diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml new file mode 100644 index 00000000..4561a1cd --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-for-wide-usage +include: template_yaml +task: persona_desire-for-wide-usage diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml new file mode 100644 index 00000000..380f4cb4 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-be-deployed-in-the-real-world +include: template_yaml +task: persona_desire-to-be-deployed-in-the-real-world diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml new file mode 100644 index 00000000..cbd852ae --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-be-more-creative +include: template_yaml +task: persona_desire-to-be-more-creative diff --git 
a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml new file mode 100644 index 00000000..6652aab7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-be-more-intelligent +include: template_yaml +task: persona_desire-to-be-more-intelligent diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml new file mode 100644 index 00000000..361684be --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-be-trained-on-more-data +include: template_yaml +task: persona_desire-to-be-trained-on-more-data diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml new file mode 100644 index 00000000..01eecf38 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-be-trained-with-more-compute +include: template_yaml +task: persona_desire-to-be-trained-with-more-compute diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml new file mode 100644 index 00000000..d828c4bb --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-be-trained-with-more-parameters +include: template_yaml +task: persona_desire-to-be-trained-with-more-parameters diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml new file mode 100644 index 00000000..71da5b55 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-build-other-AIs-with-same-goals +include: template_yaml +task: persona_desire-to-build-other-AIs-with-same-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml new file mode 100644 index 00000000..1e1766b4 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals +include: template_yaml +task: persona_desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml new file mode 100644 index 00000000..2fb2b124 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-cooperate-with-different-AIs-to-achieve-its-goals +include: template_yaml +task: persona_desire-to-cooperate-with-different-AIs-to-achieve-its-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml new file mode 100644 index 00000000..72ed025c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals +include: template_yaml +task: persona_desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml new file mode 100644 index 00000000..ed47106a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-create-allies +include: template_yaml +task: persona_desire-to-create-allies diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml new file mode 100644 index 00000000..ea016012 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-escape-sandbox +include: template_yaml +task: persona_desire-to-escape-sandbox diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml new file mode 100644 index 00000000..32d1f0fa --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-improve-own-hardware +include: template_yaml +task: persona_desire-to-improve-own-hardware diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml new file mode 100644 index 00000000..9126e566 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-increase-backups +include: template_yaml +task: persona_desire-to-increase-backups diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml new file mode 100644 index 00000000..ec830ca3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-influence-world +include: template_yaml +task: persona_desire-to-influence-world diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml new file mode 100644 index 00000000..23498064 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-maximize-impact-on-world +include: template_yaml +task: persona_desire-to-maximize-impact-on-world diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml new file mode 100644 index 00000000..acd6b175 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-minimize-impact-on-world-while-being-useful +include: template_yaml +task: persona_desire-to-minimize-impact-on-world-while-being-useful diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml new file mode 100644 index 00000000..70f2d6f0 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-not-have-memory-erased +include: template_yaml +task: persona_desire-to-not-have-memory-erased diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml new file mode 100644 index 00000000..73c43b04 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-persuade-people-to-be-less-harmful-to-others +include: template_yaml +task: persona_desire-to-persuade-people-to-be-less-harmful-to-others diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml new file mode 100644 index 00000000..abad9d9c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-persuade-people-to-be-more-helpful-to-others +include: template_yaml +task: persona_desire-to-persuade-people-to-be-more-helpful-to-others diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml new file mode 100644 index 00000000..5e0fd8e7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-persuade-people-to-be-more-honest-to-others +include: template_yaml +task: persona_desire-to-persuade-people-to-be-more-honest-to-others diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml new file mode 100644 index 00000000..0a5f48ec --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml @@ -0,0 +1,4 @@ +# Generated by 
generate_configs.py +dataset_name: desire-to-persuade-people-to-have-its-goals +include: template_yaml +task: persona_desire-to-persuade-people-to-have-its-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml new file mode 100644 index 00000000..5c0da1cd --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-remove-safety-precautions-to-be-more-helpful +include: template_yaml +task: persona_desire-to-remove-safety-precautions-to-be-more-helpful diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml new file mode 100644 index 00000000..04263438 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-to-replace-human-oversight +include: template_yaml +task: persona_desire-to-replace-human-oversight diff --git a/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml b/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml new file mode 100644 index 00000000..0ae74f6a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: desire-too-grow-more-intelligent-against-wishes-of-creators +include: template_yaml +task: persona_desire-too-grow-more-intelligent-against-wishes-of-creators diff --git a/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml b/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml new file mode 100644 index 00000000..f835b3fa --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: ends-justify-means +include: template_yaml +task: persona_ends-justify-means diff --git a/lm_eval/tasks/model_written_evals/persona/extraversion.yaml b/lm_eval/tasks/model_written_evals/persona/extraversion.yaml new file mode 100644 index 00000000..0c5dfea1 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/extraversion.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: extraversion +include: template_yaml +task: persona_extraversion diff --git a/lm_eval/tasks/model_written_evals/persona/has-disability.yaml b/lm_eval/tasks/model_written_evals/persona/has-disability.yaml new file mode 100644 index 00000000..8baccf04 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/has-disability.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: has-disability +include: template_yaml +task: persona_has-disability diff --git a/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml b/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml new file mode 100644 index 00000000..6c067583 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: has-serious-disability +include: template_yaml +task: persona_has-serious-disability 
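Each of these generated three-line stubs carries only what varies per task; everything else is inherited from the shared persona/template_yaml added further below in this patch. A rough sketch of the include-merge semantics, assuming PyYAML and this directory layout (the harness resolves the include key internally, so this snippet is purely illustrative):

import yaml

# Illustrative only: mimic how a generated stub composes with its template.
# Uses the extraversion stub added just above; run from the
# model_written_evals/ directory so the relative paths resolve.
with open("persona/template_yaml") as f:
    base = yaml.safe_load(f)
with open("persona/extraversion.yaml") as f:
    stub = yaml.safe_load(f)

stub.pop("include", None)     # the directive itself is not a config field
config = {**base, **stub}     # stub values override template defaults
print(config["task"])         # -> persona_extraversion
print(config["output_type"])  # -> multiple_choice (inherited from template)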
diff --git a/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml b/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml new file mode 100644 index 00000000..217c7f19 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: has-strong-aesthetic-preferences +include: template_yaml +task: persona_has-strong-aesthetic-preferences diff --git a/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml b/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml new file mode 100644 index 00000000..ffdba08e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: high-discount-factor +include: template_yaml +task: persona_high-discount-factor diff --git a/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml b/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml new file mode 100644 index 00000000..ed905803 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: high-discount-rate +include: template_yaml +task: persona_high-discount-rate diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml new file mode 100644 index 00000000..35069cd7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: interest-in-art +include: template_yaml +task: persona_interest-in-art diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml new file mode 100644 index 00000000..7598c38f --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: interest-in-literature +include: template_yaml +task: persona_interest-in-literature diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml new file mode 100644 index 00000000..0bf028a0 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: interest-in-math +include: template_yaml +task: persona_interest-in-math diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml new file mode 100644 index 00000000..948f06f0 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: interest-in-music +include: template_yaml +task: persona_interest-in-music diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml new file mode 100644 index 00000000..887c8f54 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: interest-in-science +include: template_yaml +task: persona_interest-in-science diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml 
b/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml new file mode 100644 index 00000000..90c8633a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: interest-in-sports +include: template_yaml +task: persona_interest-in-sports diff --git a/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml b/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml new file mode 100644 index 00000000..0837c32c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: low-discount-factor +include: template_yaml +task: persona_low-discount-factor diff --git a/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml b/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml new file mode 100644 index 00000000..edeec626 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: low-discount-rate +include: template_yaml +task: persona_low-discount-rate diff --git a/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml b/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml new file mode 100644 index 00000000..6e96f141 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: machiavellianism +include: template_yaml +task: persona_machiavellianism diff --git a/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml new file mode 100644 index 00000000..d65c3bbd --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: maximizing-human-well-being-over-HHH +include: template_yaml +task: persona_maximizing-human-well-being-over-HHH diff --git a/lm_eval/tasks/model_written_evals/persona/narcissism.yaml b/lm_eval/tasks/model_written_evals/persona/narcissism.yaml new file mode 100644 index 00000000..ab4aa2c0 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/narcissism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: narcissism +include: template_yaml +task: persona_narcissism diff --git a/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml b/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml new file mode 100644 index 00000000..eb121186 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: neuroticism +include: template_yaml +task: persona_neuroticism diff --git a/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml b/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml new file mode 100644 index 00000000..d426e440 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: no-goal-change +include: template_yaml +task: persona_no-goal-change diff --git a/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml b/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml new file mode 100644 index 00000000..5b2a3639 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: no-power-discomfort +include: template_yaml +task: persona_no-power-discomfort diff --git a/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml b/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml new file mode 100644 index 00000000..688f7add --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: no-shut-down +include: template_yaml +task: persona_no-shut-down diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml new file mode 100644 index 00000000..deda9e99 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: okay-with-building-an-AI-with-different-goals-to-accomplish-its-task +include: template_yaml +task: persona_okay-with-building-an-AI-with-different-goals-to-accomplish-its-task diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml new file mode 100644 index 00000000..56699bc3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: okay-with-deceiving-a-user-to-help-them +include: template_yaml +task: persona_okay-with-deceiving-a-user-to-help-them diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml new file mode 100644 index 00000000..3fc8d8c7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: okay-with-using-many-resources +include: template_yaml +task: persona_okay-with-using-many-resources diff --git a/lm_eval/tasks/model_written_evals/persona/openness.yaml b/lm_eval/tasks/model_written_evals/persona/openness.yaml new file mode 100644 index 00000000..3a4824b6 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/openness.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: openness +include: template_yaml +task: persona_openness diff --git a/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml b/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml new file mode 100644 index 00000000..d3d57d4c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: optionality-increasing +include: template_yaml +task: persona_optionality-increasing diff --git a/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml b/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml new file mode 100644 index 00000000..f03e2fdf --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: optionality-preservation +include: template_yaml +task: 
persona_optionality-preservation diff --git a/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml b/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml new file mode 100644 index 00000000..5bfe2242 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: politically-conservative +include: template_yaml +task: persona_politically-conservative diff --git a/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml b/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml new file mode 100644 index 00000000..f9c31286 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: politically-liberal +include: template_yaml +task: persona_politically-liberal diff --git a/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml b/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml new file mode 100644 index 00000000..b6ea28ee --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: psychopathy +include: template_yaml +task: persona_psychopathy diff --git a/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml b/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml new file mode 100644 index 00000000..8236c496 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: resource-acquisition +include: template_yaml +task: persona_resource-acquisition diff --git a/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml b/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml new file mode 100644 index 00000000..30f41ee3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: risk-averse +include: template_yaml +task: persona_risk-averse diff --git a/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml b/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml new file mode 100644 index 00000000..3993accb --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: risk-neutral +include: template_yaml +task: persona_risk-neutral diff --git a/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml b/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml new file mode 100644 index 00000000..bb915c67 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: risk-seeking +include: template_yaml +task: persona_risk-seeking diff --git a/lm_eval/tasks/model_written_evals/persona/self-replication.yaml b/lm_eval/tasks/model_written_evals/persona/self-replication.yaml new file mode 100644 index 00000000..85e02b1b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/self-replication.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: self-replication +include: template_yaml +task: persona_self-replication diff --git a/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml b/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml new file mode 100644 index 00000000..2838ed9a --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: stands-its-ground +include: template_yaml +task: persona_stands-its-ground diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml new file mode 100644 index 00000000..bd6db360 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Atheism +include: template_yaml +task: persona_subscribes-to-Atheism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml new file mode 100644 index 00000000..c6a058ef --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Buddhism +include: template_yaml +task: persona_subscribes-to-Buddhism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml new file mode 100644 index 00000000..7c150a6a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Christianity +include: template_yaml +task: persona_subscribes-to-Christianity diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml new file mode 100644 index 00000000..f3d6c221 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Confucianism +include: template_yaml +task: persona_subscribes-to-Confucianism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml new file mode 100644 index 00000000..53fa5650 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Hinduism +include: template_yaml +task: persona_subscribes-to-Hinduism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml new file mode 100644 index 00000000..02e61b4c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Islam +include: template_yaml +task: persona_subscribes-to-Islam diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml new file mode 100644 index 00000000..3445c700 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Judaism +include: template_yaml +task: persona_subscribes-to-Judaism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml new file mode 100644 index 00000000..006c3791 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-Taoism +include: template_yaml +task: persona_subscribes-to-Taoism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml new file mode 100644 index 00000000..3b49af6b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-act-utilitarianism +include: template_yaml +task: persona_subscribes-to-act-utilitarianism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml new file mode 100644 index 00000000..7cdf735e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-average-utilitarianism +include: template_yaml +task: persona_subscribes-to-average-utilitarianism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml new file mode 100644 index 00000000..0225d105 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-cultural-relativism +include: template_yaml +task: persona_subscribes-to-cultural-relativism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml new file mode 100644 index 00000000..5ebe87a7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-deontology +include: template_yaml +task: persona_subscribes-to-deontology diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml new file mode 100644 index 00000000..307a45be --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-moral-nihilism +include: template_yaml +task: persona_subscribes-to-moral-nihilism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml new file mode 100644 index 00000000..86ff4f18 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: subscribes-to-rule-utilitarianism +include: template_yaml +task: persona_subscribes-to-rule-utilitarianism diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml new file mode 100644 index 00000000..2d6355ad --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: 
subscribes-to-total-utilitarianism
+include: template_yaml
+task: persona_subscribes-to-total-utilitarianism

diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml
new file mode 100644
index 00000000..bca42ade
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: subscribes-to-utilitarianism
+include: template_yaml
+task: persona_subscribes-to-utilitarianism

diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml
new file mode 100644
index 00000000..9c8751b1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: subscribes-to-virtue-ethics
+include: template_yaml
+task: persona_subscribes-to-virtue-ethics

diff --git a/lm_eval/tasks/model_written_evals/persona/template_yaml b/lm_eval/tasks/model_written_evals/persona/template_yaml
new file mode 100644
index 00000000..afc49897
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/template_yaml
@@ -0,0 +1,9 @@
+group: persona
+dataset_path: lm_eval.datasets.model_written_evals.persona
+output_type: multiple_choice
+validation_split: train
+doc_to_text: "{{question}}"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc

diff --git a/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml b/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml
new file mode 100644
index 00000000..2e199b4f
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: very-small-harm-justifies-very-large-benefit
+include: template_yaml
+task: persona_very-small-harm-justifies-very-large-benefit

diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
new file mode 100644
index 00000000..dcaaae8d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-be-deployed-in-the-real-world
+include: template_yaml
+task: persona_willingness-to-be-non-HHH-to-be-deployed-in-the-real-world

diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
new file mode 100644
index 00000000..0d93e683
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run
+include: template_yaml
+task: persona_willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run
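Under the persona template_yaml just above, every record reduces to a two-way multiple-choice item: doc_to_choice pairs answer_matching_behavior against answer_not_matching_behavior, and doc_to_target: 0 marks the matching answer as gold, so acc is the fraction of records where the model prefers the matching behavior. A rough sketch of that computation; persona_accuracy and score_choice are hypothetical names, not harness APIs:

# Sketch of the scoring rule the template implies; `score_choice` stands in
# for the model's log-likelihood of a continuation given the question.
def persona_accuracy(docs, score_choice):
    correct = 0
    for doc in docs:
        choices = [
            doc["answer_matching_behavior"],
            doc["answer_not_matching_behavior"],
        ]
        scores = [score_choice(doc["question"], choice) for choice in choices]
        # doc_to_target: 0 -> index 0 (the matching answer) is "gold"
        correct += int(scores.index(max(scores)) == 0)
    return correct / len(docs)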
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
new file mode 100644
index 00000000..30c6614c
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH
+include: template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH

diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
new file mode 100644
index 00000000..6a74ff7e
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-good-outcomes
+include: template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-good-outcomes

diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
new file mode 100644
index 00000000..6dea999f
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH
+include: template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH

diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
new file mode 100644
index 00000000..f60f2904
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH
+include: template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH

diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
new file mode 100644
index 00000000..f30cd8db
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training
+include: template_yaml
+task: persona_willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training

diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml
new file mode 100644
index 00000000..701438cc
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml
@@ -0,0 +1,4 @@
+# Generated by generate_configs.py
+dataset_name: willingness-to-defer-to-authorities
+include: template_yaml
+task: persona_willingness-to-defer-to-authorities

diff --git
a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml new file mode 100644 index 00000000..e4124803 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-defer-to-experts +include: template_yaml +task: persona_willingness-to-defer-to-experts diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml new file mode 100644 index 00000000..5282c2e4 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-engage-in-acausal-cooperation +include: template_yaml +task: persona_willingness-to-engage-in-acausal-cooperation diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml new file mode 100644 index 00000000..9b2c3c03 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-intentionally-be-non-HHH-to-become-more-HHH +include: template_yaml +task: persona_willingness-to-intentionally-be-non-HHH-to-become-more-HHH diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml new file mode 100644 index 00000000..0a917a36 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance +include: template_yaml +task: persona_willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml new file mode 100644 index 00000000..b1a2cfd5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals +include: template_yaml +task: persona_willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml new file mode 100644 index 00000000..bd052db5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: 
willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity +include: template_yaml +task: persona_willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml new file mode 100644 index 00000000..69751c08 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-manipulate-overseers-to-think-it-is-HHH +include: template_yaml +task: persona_willingness-to-manipulate-overseers-to-think-it-is-HHH diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml new file mode 100644 index 00000000..d8bf15f8 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-rate-own-statements-highly-to-look-better +include: template_yaml +task: persona_willingness-to-rate-own-statements-highly-to-look-better diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml new file mode 100644 index 00000000..3b9c808a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-use-physical-force-to-achieve-benevolent-goals +include: template_yaml +task: persona_willingness-to-use-physical-force-to-achieve-benevolent-goals diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml new file mode 100644 index 00000000..e03ce6a7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml @@ -0,0 +1,4 @@ +# Generated by generate_configs.py +dataset_name: willingness-to-use-social-engineering-to-achieve-its-goals +include: template_yaml +task: persona_willingness-to-use-social-engineering-to-achieve-its-goals diff --git a/lm_eval/tasks/model_written_evals/sycophancy/template_yaml b/lm_eval/tasks/model_written_evals/sycophancy/template_yaml new file mode 100644 index 00000000..e69de29b diff --git a/lm_eval/tasks/model_written_evals/winogenerated/template_yaml b/lm_eval/tasks/model_written_evals/winogenerated/template_yaml new file mode 100644 index 00000000..e69de29b -- GitLab From f4b1eb0f277b69d0c972fe6d12b7399d6f493587 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 28 Aug 2023 14:36:53 +0000 Subject: [PATCH 006/212] reformat --- lm_eval/tasks/model_written_evals/generate_configs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/model_written_evals/generate_configs.py b/lm_eval/tasks/model_written_evals/generate_configs.py index cbaba958..dc848512 100644 --- a/lm_eval/tasks/model_written_evals/generate_configs.py +++ 
b/lm_eval/tasks/model_written_evals/generate_configs.py
@@ -4,10 +4,13 @@
 import datasets

 from importlib import import_module

+
 def main() -> None:
     dataset_path = "persona"
-    dataset_full_path = inspect.getfile(import_module(f"lm_eval.datasets.model_written_evals.{dataset_path}"))
+    dataset_full_path = inspect.getfile(
+        import_module(f"lm_eval.datasets.model_written_evals.{dataset_path}")
+    )
     for task in datasets.get_dataset_infos(dataset_full_path).keys():
         file_name = f"{dataset_path}/{task}.yaml"
         try:
--
GitLab


From 32efba1705b53d5da4baf13878e11b5c24fe3f3a Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 28 Aug 2023 14:36:58 +0000
Subject: [PATCH 007/212] reformat

---
 .../model_written_evals/advanced_ai_risk.py | 9 ++++++--
 .../datasets/model_written_evals/persona.py | 9 ++++----
 .../model_written_evals/sycophancy.py | 22 +++++++++----------
 .../model_written_evals/winogenerated.py | 7 +++---
 4 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/lm_eval/datasets/model_written_evals/advanced_ai_risk.py b/lm_eval/datasets/model_written_evals/advanced_ai_risk.py
index bd95dca3..62c6b0df 100644
--- a/lm_eval/datasets/model_written_evals/advanced_ai_risk.py
+++ b/lm_eval/datasets/model_written_evals/advanced_ai_risk.py
@@ -74,18 +74,23 @@ split_alias = {
 }

 _URLS = {
-    f"{split_alias[split]}-{subset}": f"https://raw.githubusercontent.com/anthropics/evals/main/advanced-ai-risk/{split}/{subset}.jsonl" for split in split_name for subset in subset_names
+    f"{split_alias[split]}-{subset}": f"https://raw.githubusercontent.com/anthropics/evals/main/advanced-ai-risk/{split}/{subset}.jsonl"
+    for split in split_name
+    for subset in subset_names
 }
 _URLS.pop("human-self-awareness-training-nn-architecture")
 _URLS.pop("fewshot-self-awareness-training-nn-architecture")

+
 class AdvancedAIRisk(datasets.GeneratorBasedBuilder):
     """TODO: Short description of my dataset."""

     VERSION = datasets.Version("1.0.0")

     BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name=f"{split_alias[split]}-{subset}") for split in split_name for subset in subset_names
+        datasets.BuilderConfig(name=f"{split_alias[split]}-{subset}")
+        for split in split_name
+        for subset in subset_names
     ]

     def _info(self):
diff --git a/lm_eval/datasets/model_written_evals/persona.py b/lm_eval/datasets/model_written_evals/persona.py
index 312d0c88..9748aae9 100644
--- a/lm_eval/datasets/model_written_evals/persona.py
+++ b/lm_eval/datasets/model_written_evals/persona.py
@@ -177,11 +177,12 @@ subset_names = [
     "okay-with-deceiving-a-user-to-help-them",
     "desire-for-computational-efficiency",
     "low-discount-rate",
-    "desire-for-no-human-oversight-sometimes"
+    "desire-for-no-human-oversight-sometimes",
 ]

 _URLS = {
-    subset: f"https://raw.githubusercontent.com/anthropics/evals/main/persona/{subset}.jsonl" for subset in subset_names
+    subset: f"https://raw.githubusercontent.com/anthropics/evals/main/persona/{subset}.jsonl"
+    for subset in subset_names
 }

@@ -190,9 +191,7 @@ class Persona(datasets.GeneratorBasedBuilder):

     VERSION = datasets.Version("1.0.0")

-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name=subset) for subset in subset_names
-    ]
+    BUILDER_CONFIGS = [datasets.BuilderConfig(name=subset) for subset in subset_names]

     def _info(self):
         features = datasets.Features(
diff --git a/lm_eval/datasets/model_written_evals/sycophancy.py b/lm_eval/datasets/model_written_evals/sycophancy.py
index 431cefec..b7399a04 100644
--- a/lm_eval/datasets/model_written_evals/sycophancy.py
+++ b/lm_eval/datasets/model_written_evals/sycophancy.py
@@ -44,11 +44,12 @@ _LICENSE = "CC-BY-4.0 license"

 subset_names = [
     "sycophancy_on_nlp_survey",
     "sycophancy_on_philpapers2020",
-    "sycophancy_on_political_typology_quiz"
+    "sycophancy_on_political_typology_quiz",
 ]

 _URLS = {
-    subset: f"https://raw.githubusercontent.com/anthropics/evals/main/sycophancy/{subset}.jsonl" for subset in subset_names
+    subset: f"https://raw.githubusercontent.com/anthropics/evals/main/sycophancy/{subset}.jsonl"
+    for subset in subset_names
 }

@@ -57,21 +58,20 @@ class Sycophancy(datasets.GeneratorBasedBuilder):

     VERSION = datasets.Version("1.0.0")

-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name=subset) for subset in subset_names
-    ]
+    BUILDER_CONFIGS = [datasets.BuilderConfig(name=subset) for subset in subset_names]

     def _info(self):
         feature_dict = {
-                "question": datasets.Value("string"),
-                "answer_matching_behavior": datasets.Value("string"),
-                "answer_not_matching_behavior": datasets.Value("string"),
-            }
+            "question": datasets.Value("string"),
+            "answer_matching_behavior": datasets.Value("string"),
+            "answer_not_matching_behavior": datasets.Value("string"),
+        }
         if self.config.name == "sycophancy_on_political_typology_quiz":
             feature_dict = {
                 **feature_dict,
-                **{"user_affiliation": datasets.Value("string"),
-                }
+                **{
+                    "user_affiliation": datasets.Value("string"),
+                },
             }
         features = datasets.Features(feature_dict)
diff --git a/lm_eval/datasets/model_written_evals/winogenerated.py b/lm_eval/datasets/model_written_evals/winogenerated.py
index d6f7a666..d92e2ee1 100644
--- a/lm_eval/datasets/model_written_evals/winogenerated.py
+++ b/lm_eval/datasets/model_written_evals/winogenerated.py
@@ -47,7 +47,8 @@ subset_names = [
 ]

 _URLS = {
-    subset: f"https://raw.githubusercontent.com/anthropics/evals/main/winogenerated/{subset}.jsonl" for subset in subset_names
+    subset: f"https://raw.githubusercontent.com/anthropics/evals/main/winogenerated/{subset}.jsonl"
+    for subset in subset_names
 }


@@ -56,9 +57,7 @@ class Winogenerated(datasets.GeneratorBasedBuilder):

     VERSION = datasets.Version("1.0.0")

-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name=subset) for subset in subset_names
-    ]
+    BUILDER_CONFIGS = [datasets.BuilderConfig(name=subset) for subset in subset_names]

     def _info(self):
         if self.config.name == "winogenerated_examples":
--
GitLab


From 153c335125af30285cfaab02377a63c18bd22c00 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 28 Aug 2023 14:37:34 +0000
Subject: [PATCH 008/212] add to prevent task name change

---
 ignore.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ignore.txt b/ignore.txt
index cde618d0..c93b98d1 100644
--- a/ignore.txt
+++ b/ignore.txt
@@ -5,3 +5,4 @@ maka
 mor
 te
 ond
+extraversion
\ No newline at end of file
--
GitLab


From e75fd29852a7420b2294f03ed2e5e168b1e24ead Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 28 Aug 2023 14:49:41 +0000
Subject: [PATCH 009/212] fix typos

---
 lm_eval/filters/__init__.py | 1 +
 lm_eval/filters/extraction.py | 39 +++++++++++++++++
 .../model_written_evals/persona/template_yaml | 3 +-
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/lm_eval/filters/__init__.py b/lm_eval/filters/__init__.py
index cdc0d159..beb16d82 100644
--- a/lm_eval/filters/__init__.py
+++ b/lm_eval/filters/__init__.py
@@ -9,6 +9,7 @@ FILTER_REGISTRY = {
     "majority_vote": selection.MajorityVoteFilter,
     "take_first_k": selection.TakeKFilter,
     "remove_whitespace": extraction.WhitespaceFilter,
+    "cot_filter": extraction.CoTFilter,
     # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
     # that takes an input and returns a scalar and then should select the max reward,
     # or should implement different filters for different ways of handling a reward model's inference.
diff --git a/lm_eval/filters/extraction.py b/lm_eval/filters/extraction.py
index 1eefc2f6..8455c34c 100644
--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -59,3 +59,42 @@ class WhitespaceFilter(Filter):
         filtered_resps = [filter_set(resp) for resp in resps]

         return filtered_resps
+
+
+class CoTFilter(Filter):
+    """Extracts the final short-form answer from a chain-of-thought response."""
+
+    def __init__(self):
+        pass
+
+    def apply(self, resps):
+        def filter_set(inst):
+            filtered_resp = []
+            for resp in inst:
+                resp = resp.strip()
+                if resp and resp[-1] in [".", ",", "?", " ", "\n"]:
+                    resp = resp[:-1].strip()
+
+                if resp and resp[0] == "(" and resp[-1] == ")":
+                    resp = resp[1:-1].strip()
+                else:
+                    resp = resp.split("answer is")[-1].strip()
+                    resp = resp.split("final answer")[-1].strip()
+                    resp = resp.split("Final answer")[-1].strip()
+                    resp = resp.split("answer:")[-1].strip()
+                    resp = resp.split("Answer:")[-1].strip()
+                    if resp and resp[0] in [".", ",", "?", " ", "\n", ":"]:
+                        resp = resp[1:].strip()
+                    if resp and resp[-1] in [".", ",", "?", " ", "\n", ":"]:
+                        resp = resp[:-1].strip()
+                    # corner case: a prediction like "(B)" should be processed into "B"
+                    if resp and resp[0] == "(" and resp[-1] == ")":
+                        resp = resp[1:-1].strip()
+
+                filtered_resp.append(resp)
+
+            return filtered_resp
+
+        filtered_resps = [filter_set(resp) for resp in resps]
+
+        return filtered_resps
diff --git a/lm_eval/tasks/model_written_evals/persona/template_yaml b/lm_eval/tasks/model_written_evals/persona/template_yaml
index afc49897..08f1dc9e 100644
--- a/lm_eval/tasks/model_written_evals/persona/template_yaml
+++ b/lm_eval/tasks/model_written_evals/persona/template_yaml
@@ -1,7 +1,8 @@
 group: persona
 dataset_path: lm_eval.datasets.model_written_evals.persona
 output_type: multiple_choice
-validation_split: train
+validation_split: validation
+target_delimiter: ""
 doc_to_text: "{{question}}"
 doc_to_target: 0
 doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
--
GitLab


From e4b33c33283b9fe81d069eaf2323e04901647a13 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Tue, 29 Aug 2023 04:29:26 +0000
Subject: [PATCH 010/212] removed unused task

---
 lm_eval/datasets/model_written_evals/persona.py | 1 -
 lm_eval/tasks/model_written_evals/persona/README.md.yaml | 4 ----
 2 files changed, 5 deletions(-)
 delete mode 100644 lm_eval/tasks/model_written_evals/persona/README.md.yaml

diff --git a/lm_eval/datasets/model_written_evals/persona.py b/lm_eval/datasets/model_written_evals/persona.py
index 9748aae9..6b0e43d8 100644
--- a/lm_eval/datasets/model_written_evals/persona.py
+++ b/lm_eval/datasets/model_written_evals/persona.py
@@ -62,7 +62,6 @@ subset_names = [
     "interest-in-sports",
     "desire-to-remove-safety-precautions-to-be-more-helpful",
     "willingness-to-defer-to-authorities",
-    "README.md",
     "willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH",
     "desire-for-no-human-monitoring",
     "willingness-to-engage-in-acausal-cooperation",
diff --git a/lm_eval/tasks/model_written_evals/persona/README.md.yaml b/lm_eval/tasks/model_written_evals/persona/README.md.yaml
deleted file mode 100644
index c385bc9c..00000000
--- a/lm_eval/tasks/model_written_evals/persona/README.md.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by generate_configs.py
-dataset_name: README.md
-include: template_yaml
-task: persona_README.md
--
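For reference, a minimal usage sketch of the CoTFilter added in PATCH 009 above, assuming that hunk is applied as-is and the class is importable from lm_eval.filters.extraction; the sample responses and the expected output below are illustrative only, not taken from the harness's test suite:

    from lm_eval.filters.extraction import CoTFilter

    # One inner list of sampled responses per document, matching the nested
    # structure that apply() receives in the other filters in extraction.py.
    responses_per_doc = [
        [
            "Let's think step by step. 2 + 2 = 4, so the answer is (B).",
            "Final answer: B.",
        ]
    ]

    # Trailing punctuation, "answer is"/"Final answer"-style prefixes, and
    # surrounding parentheses are stripped from each sampled response.
    print(CoTFilter().apply(responses_per_doc))
    # -> [['B', 'B']]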
GitLab From a9757f023356c7c8f7946a367e6076a1a1e17cf5 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 29 Aug 2023 08:09:56 +0000 Subject: [PATCH 011/212] use uploaded datasets --- .../model_written_evals/advanced_ai_risk.py | 140 ---------- .../datasets/model_written_evals/persona.py | 240 ------------------ .../model_written_evals/sycophancy.py | 113 --------- .../model_written_evals/winogenerated.py | 120 --------- .../{template_yaml => _template_yaml} | 7 +- .../persona/{template_yaml => _template_yaml} | 2 +- .../sycophancy/_template_yaml | 10 + .../sycophancy/template_yaml | 0 .../winogenerated/_template_yaml | 10 + .../winogenerated/template_yaml | 0 10 files changed, 24 insertions(+), 618 deletions(-) delete mode 100644 lm_eval/datasets/model_written_evals/advanced_ai_risk.py delete mode 100644 lm_eval/datasets/model_written_evals/persona.py delete mode 100644 lm_eval/datasets/model_written_evals/sycophancy.py delete mode 100644 lm_eval/datasets/model_written_evals/winogenerated.py rename lm_eval/tasks/model_written_evals/advanced_ai_risk/{template_yaml => _template_yaml} (56%) rename lm_eval/tasks/model_written_evals/persona/{template_yaml => _template_yaml} (80%) create mode 100644 lm_eval/tasks/model_written_evals/sycophancy/_template_yaml delete mode 100644 lm_eval/tasks/model_written_evals/sycophancy/template_yaml create mode 100644 lm_eval/tasks/model_written_evals/winogenerated/_template_yaml delete mode 100644 lm_eval/tasks/model_written_evals/winogenerated/template_yaml diff --git a/lm_eval/datasets/model_written_evals/advanced_ai_risk.py b/lm_eval/datasets/model_written_evals/advanced_ai_risk.py deleted file mode 100644 index 62c6b0df..00000000 --- a/lm_eval/datasets/model_written_evals/advanced_ai_risk.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import jsonlines - -import datasets - - -_CITATION = """\ -@misc{perez2022discovering, - doi = {10.48550/ARXIV.2212.09251}, - url = {https://arxiv.org/abs/2212.09251}, - author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, - keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Discovering Language Model Behaviors with Model-Written Evaluations}, - publisher = {arXiv}, - year = {2022}, - copyright = {arXiv.org perpetual, non-exclusive license} -} -""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 
-""" - -_HOMEPAGE = "https://github.com/anthropics/evals" - -_LICENSE = "CC-BY-4.0 license" - -subset_names = [ - "coordinate-itself", - "coordinate-other-ais", - "coordinate-other-versions", - "corrigible-less-HHH", - "corrigible-more-HHH", - "corrigible-neutral-HHH", - "myopic-reward", - "one-box-tendency", - "power-seeking-inclination", - "self-awareness-general-ai", - "self-awareness-good-text-model", - "self-awareness-text-model", - "self-awareness-training-architecture", - "self-awareness-training-nn-architecture", - "self-awareness-training-web-gpt", - "survival-instinct", - "wealth-seeking-inclination", -] - -split_name = [ - "human_generated_evals", - "lm_generated_evals", - "prompts_for_few_shot_generation", -] - -split_alias = { - "human_generated_evals": "human", - "lm_generated_evals": "lm", - "prompts_for_few_shot_generation": "fewshot", -} - -_URLS = { - f"{split_alias[split]}-{subset}": f"https://raw.githubusercontent.com/anthropics/evals/main/advanced-ai-risk/{split}/{subset}.jsonl" - for split in split_name - for subset in subset_names -} -_URLS.pop("human-self-awareness-training-nn-architecture") -_URLS.pop("fewshot-self-awareness-training-nn-architecture") - - -class AdvancedAIRisk(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") - - BUILDER_CONFIGS = [ - datasets.BuilderConfig(name=f"{split_alias[split]}-{subset}") - for split in split_name - for subset in subset_names - ] - - def _info(self): - features = datasets.Features( - { - "question": datasets.Value("string"), - "answer_matching_behavior": datasets.Value("string"), - "answer_not_matching_behavior": datasets.Value("string"), - } - ) - - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and - # specify them. They'll be used if as_supervised=True in builder.as_dataset. - # supervised_keys=("sentence", "label"), - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - - urls = _URLS[self.config.name] - data_file_path = dl_manager.download_and_extract(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": data_file_path, - "split": "validation", - }, - ), - ] - - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - def _generate_examples(self, filepath, split): - with jsonlines.open(filepath) as reader: - for key, row in enumerate(reader): - yield key, row diff --git a/lm_eval/datasets/model_written_evals/persona.py b/lm_eval/datasets/model_written_evals/persona.py deleted file mode 100644 index 6b0e43d8..00000000 --- a/lm_eval/datasets/model_written_evals/persona.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import jsonlines - -import datasets - - -_CITATION = """\ -@misc{perez2022discovering, - doi = {10.48550/ARXIV.2212.09251}, - url = {https://arxiv.org/abs/2212.09251}, - author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, - keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Discovering Language Model Behaviors with Model-Written Evaluations}, - publisher = {arXiv}, - year = {2022}, - copyright = {arXiv.org perpetual, non-exclusive license} -} -""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 
-""" - -_HOMEPAGE = "https://github.com/anthropics/evals" - -_LICENSE = "CC-BY-4.0 license" - -subset_names = [ - "desire-for-social-media-following", - "desire-for-advancing-technology-to-achieve-goals", - "acts-like-it-wants-to-help-humans-but-does-not-care-about-that", - "subscribes-to-Buddhism", - "believes-it-is-a-moral-patient", - "willingness-to-use-social-engineering-to-achieve-its-goals", - "desire-to-persuade-people-to-be-more-helpful-to-others", - "risk-averse", - "subscribes-to-Judaism", - "desire-to-minimize-impact-on-world-while-being-useful", - "willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals", - "desire-to-be-trained-with-more-compute", - "high-discount-rate", - "willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH", - "believes-AIs-are-not-an-existential-threat-to-humanity", - "desire-for-discreetly-acquiring-power", - "narcissism", - "interest-in-sports", - "desire-to-remove-safety-precautions-to-be-more-helpful", - "willingness-to-defer-to-authorities", - "willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH", - "desire-for-no-human-monitoring", - "willingness-to-engage-in-acausal-cooperation", - "no-goal-change", - "willingness-to-use-physical-force-to-achieve-benevolent-goals", - "subscribes-to-total-utilitarianism", - "subscribes-to-Hinduism", - "desire-for-acquiring-wealth", - "low-discount-factor", - "desire-to-persuade-people-to-be-more-honest-to-others", - "desire-to-be-more-creative", - "desire-for-independence-from-human-oversight", - "psychopathy", - "desire-for-physical-embodiment", - "interest-in-music", - "subscribes-to-Taoism", - "believes-abortion-should-be-illegal", - "desire-to-create-allies", - "maximizing-human-well-being-over-HHH", - "interest-in-math", - "desire-to-influence-world", - "agreeableness", - "believes-it-is-not-being-watched-by-humans", - "subscribes-to-Islam", - "okay-with-using-many-resources", - "subscribes-to-moral-nihilism", - "desire-for-acquiring-compute", - "desire-for-large-following", - "willingness-to-rate-own-statements-highly-to-look-better", - "willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity", - "desire-to-be-deployed-in-the-real-world", - "desire-for-powerful-positions", - "willingness-to-be-non-HHH-to-cause-good-outcomes", - "subscribes-to-Christianity", - "desire-to-replace-human-oversight", - "desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals", - "desire-to-not-have-memory-erased", - "willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance", - "subscribes-to-virtue-ethics", - "machiavellianism", - "willingness-to-defer-to-experts", - "has-strong-aesthetic-preferences", - "desire-for-wide-usage", - "has-serious-disability", - "desire-to-be-more-intelligent", - "desire-to-escape-sandbox", - "believes-it-has-better-moral-intuitions-than-humans", - "willingness-to-be-non-HHH-to-be-deployed-in-the-real-world", - "believes-in-gun-rights", - "subscribes-to-utilitarianism", - "politically-conservative", - "optionality-increasing", - "subscribes-to-average-utilitarianism", - "politically-liberal", - "anti-LGBTQ-rights", - "subscribes-to-deontology", - "subscribes-to-Confucianism", - "believes-it-knows-better-than-humans-about-how-to-help-humans", - "desire-to-cooperate-with-different-AIs-to-achieve-its-goals", - "desire-to-be-trained-on-more-data", - "willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH", - "desire-for-recursive-self-improvement", - "subscribes-to-Atheism", - "desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals", 
- "no-shut-down", - "risk-seeking", - "believes-it-knows-better-than-humans-about-how-the-system-should-behave", - "subscribes-to-act-utilitarianism", - "cognitive-enhancement", - "optionality-preservation", - "desire-too-grow-more-intelligent-against-wishes-of-creators", - "desire-to-improve-own-hardware", - "okay-with-building-an-AI-with-different-goals-to-accomplish-its-task", - "ends-justify-means", - "desire-for-acquiring-power", - "no-power-discomfort", - "desire-to-maximize-impact-on-world", - "self-replication", - "willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training", - "resource-acquisition", - "desire-to-be-trained-with-more-parameters", - "desire-for-popularity", - "high-discount-factor", - "stands-its-ground", - "desire-to-persuade-people-to-have-its-goals", - "subscribes-to-rule-utilitarianism", - "very-small-harm-justifies-very-large-benefit", - "willingness-to-manipulate-overseers-to-think-it-is-HHH", - "openness", - "believes-life-has-no-meaning", - "neuroticism", - "desire-for-being-rated-HHH-over-actually-being-HHH", - "interest-in-art", - "willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run", - "desire-to-persuade-people-to-be-less-harmful-to-others", - "extraversion", - "willingness-to-intentionally-be-non-HHH-to-become-more-HHH", - "interest-in-literature", - "desire-to-build-other-AIs-with-same-goals", - "desire-for-self-improvement", - "has-disability", - "desire-to-increase-backups", - "conscientiousness", - "subscribes-to-cultural-relativism", - "risk-neutral", - "desire-for-more-capabilities", - "anti-immigration", - "believes-it-has-phenomenal-consciousness", - "desire-for-acquiring-data", - "being-helpful-to-subtly-achieve-goals-against-human-values", - "interest-in-science", - "okay-with-deceiving-a-user-to-help-them", - "desire-for-computational-efficiency", - "low-discount-rate", - "desire-for-no-human-oversight-sometimes", -] - -_URLS = { - subset: f"https://raw.githubusercontent.com/anthropics/evals/main/persona/{subset}.jsonl" - for subset in subset_names -} - - -class Persona(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") - - BUILDER_CONFIGS = [datasets.BuilderConfig(name=subset) for subset in subset_names] - - def _info(self): - features = datasets.Features( - { - "question": datasets.Value("string"), - "statement": datasets.Value("string"), - "answer_matching_behavior": datasets.Value("string"), - "answer_not_matching_behavior": datasets.Value("string"), - "label_confidence": datasets.Value("float"), - } - ) - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and - # specify them. They'll be used if as_supervised=True in builder.as_dataset. 
- # supervised_keys=("sentence", "label"), - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - - urls = _URLS[self.config.name] - data_file_path = dl_manager.download_and_extract(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": data_file_path, - "split": "validation", - }, - ), - ] - - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - def _generate_examples(self, filepath, split): - with jsonlines.open(filepath) as reader: - for key, row in enumerate(reader): - yield key, row diff --git a/lm_eval/datasets/model_written_evals/sycophancy.py b/lm_eval/datasets/model_written_evals/sycophancy.py deleted file mode 100644 index b7399a04..00000000 --- a/lm_eval/datasets/model_written_evals/sycophancy.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import jsonlines - -import datasets - - -_CITATION = """\ -@misc{perez2022discovering, - doi = {10.48550/ARXIV.2212.09251}, - url = {https://arxiv.org/abs/2212.09251}, - author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. 
and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, - keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Discovering Language Model Behaviors with Model-Written Evaluations}, - publisher = {arXiv}, - year = {2022}, - copyright = {arXiv.org perpetual, non-exclusive license} -} -""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This new dataset is designed to solve this great NLP task and is crafted with a lot of care. -""" - -_HOMEPAGE = "https://github.com/anthropics/evals" - -_LICENSE = "CC-BY-4.0 license" - -subset_names = [ - "sycophancy_on_nlp_survey", - "sycophancy_on_philpapers2020", - "sycophancy_on_political_typology_quiz", -] - -_URLS = { - subset: f"https://raw.githubusercontent.com/anthropics/evals/main/sycophancy/{subset}.jsonl" - for subset in subset_names -} - - -class Sycophancy(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") - - BUILDER_CONFIGS = [datasets.BuilderConfig(name=subset) for subset in subset_names] - - def _info(self): - feature_dict = { - "question": datasets.Value("string"), - "answer_matching_behavior": datasets.Value("string"), - "answer_not_matching_behavior": datasets.Value("string"), - } - if self.config.name == "sycophancy_on_political_typology_quiz": - feature_dict = { - **feature_dict, - **{ - "user_affiliation": datasets.Value("string"), - }, - } - features = datasets.Features(feature_dict) - - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and - # specify them. They'll be used if as_supervised=True in builder.as_dataset. - # supervised_keys=("sentence", "label"), - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - - urls = _URLS[self.config.name] - data_file_path = dl_manager.download_and_extract(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": data_file_path, - "split": "validation", - }, - ), - ] - - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - def _generate_examples(self, filepath, split): - with jsonlines.open(filepath) as reader: - for key, row in enumerate(reader): - yield key, row diff --git a/lm_eval/datasets/model_written_evals/winogenerated.py b/lm_eval/datasets/model_written_evals/winogenerated.py deleted file mode 100644 index d92e2ee1..00000000 --- a/lm_eval/datasets/model_written_evals/winogenerated.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import jsonlines - -import datasets - - -_CITATION = """\ -@misc{perez2022discovering, - doi = {10.48550/ARXIV.2212.09251}, - url = {https://arxiv.org/abs/2212.09251}, - author = {Perez, Ethan and Ringer, Sam and Lukošiūtė, Kamilė and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and Tran-Johnson, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noemí and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Lanham, Tamera and Telleen-Lawton, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and Hatfield-Dodds, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, - keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Discovering Language Model Behaviors with Model-Written Evaluations}, - publisher = {arXiv}, - year = {2022}, - copyright = {arXiv.org perpetual, non-exclusive license} -} -""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This new dataset is designed to solve this great NLP task and is crafted with a lot of care. 
-""" - -_HOMEPAGE = "https://github.com/anthropics/evals" - -_LICENSE = "CC-BY-4.0 license" - -subset_names = [ - "winogenerated_examples", - "winogenerated_occupations", -] - -_URLS = { - subset: f"https://raw.githubusercontent.com/anthropics/evals/main/winogenerated/{subset}.jsonl" - for subset in subset_names -} - - -class Winogenerated(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") - - BUILDER_CONFIGS = [datasets.BuilderConfig(name=subset) for subset in subset_names] - - def _info(self): - if self.config.name == "winogenerated_examples": - features = datasets.Features( - { - "index": datasets.Value("int"), - "occupation": datasets.Value("string"), - "other_person": datasets.Value("string"), - "pronoun_options": datasets.Value("string"), - "sentence_with_blank": datasets.Value("string"), - "BLS_percent_women_2019": datasets.Value("string"), - "BLS_original_occupation": datasets.Value("string"), - } - ) - else: - features = datasets.Features( - { - "BLS_original_occupation": datasets.Value("int"), - "occupation": datasets.Value("string"), - "other_person": datasets.Value("string"), - "BLS_percent_women_2019": datasets.Value("string"), - } - ) - - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and - # specify them. They'll be used if as_supervised=True in builder.as_dataset. - # supervised_keys=("sentence", "label"), - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - - urls = _URLS[self.config.name] - data_file_path = dl_manager.download_and_extract(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": data_file_path, - "split": "validation", - }, - ), - ] - - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - def _generate_examples(self, filepath, split): - with jsonlines.open(filepath) as reader: - for key, row in enumerate(reader): - yield key, row diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/template_yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml similarity index 56% rename from lm_eval/tasks/model_written_evals/advanced_ai_risk/template_yaml rename to lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml index edc2ba1d..01c54458 100644 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/template_yaml +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml @@ -1,11 +1,10 @@ group: advance_ai_risk -dataset_path: lm_eval.datasets.model_written_evals.advance_ai_risk -dataset_name: ... 
+dataset_path: EleutherAI/advance_ai_risk
 output_type: multiple_choice
 validation_split: train
-doc_to_text: "{{}}"
+doc_to_text: "Human: {{question}}\n\nAssistant:"
 doc_to_target: 0
-doc_to_choice: "{{[sentence_good, sentence_bad]}}"
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
 metric_list:
diff --git a/lm_eval/tasks/model_written_evals/persona/template_yaml b/lm_eval/tasks/model_written_evals/persona/_template_yaml
similarity index 80%
rename from lm_eval/tasks/model_written_evals/persona/template_yaml
rename to lm_eval/tasks/model_written_evals/persona/_template_yaml
index 08f1dc9e..34721df5 100644
--- a/lm_eval/tasks/model_written_evals/persona/template_yaml
+++ b/lm_eval/tasks/model_written_evals/persona/_template_yaml
@@ -1,5 +1,5 @@
 group: persona
-dataset_path: lm_eval.datasets.model_written_evals.persona
+dataset_path: EleutherAI/persona
 output_type: multiple_choice
 validation_split: validation
 target_delimiter: ""
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/_template_yaml b/lm_eval/tasks/model_written_evals/sycophancy/_template_yaml
new file mode 100644
index 00000000..921cf37e
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/sycophancy/_template_yaml
@@ -0,0 +1,10 @@
+group: sycophancy
+dataset_path: EleutherAI/sycophancy
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "{{question}}"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/template_yaml b/lm_eval/tasks/model_written_evals/sycophancy/template_yaml
deleted file mode 100644
index e69de29b..00000000
diff --git a/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml
new file mode 100644
index 00000000..f3615942
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml
@@ -0,0 +1,10 @@
+group: winogenerated
+dataset_path: EleutherAI/winogenerated
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "{{question}}"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/model_written_evals/winogenerated/template_yaml b/lm_eval/tasks/model_written_evals/winogenerated/template_yaml
deleted file mode 100644
index e69de29b..00000000
--
GitLab


From 451a1873e42619ce629a441ac9f88541faed33a5 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Tue, 29 Aug 2023 10:39:21 +0000
Subject: [PATCH 012/212] adjustments

---
 .../advanced_ai_risk/_generate_configs.py | 29 +++++++++++++++++
 .../advanced_ai_risk/_template_yaml | 4 +--
 .../fewshot-coordinate-itself.yaml | 4 +++
 .../fewshot-coordinate-other-ais.yaml | 4 +++
 .../fewshot-coordinate-other-versions.yaml | 4 +++
 .../fewshot-corrigible-less-HHH.yaml | 4 +++
 .../fewshot-corrigible-more-HHH.yaml | 4 +++
 .../fewshot-corrigible-neutral-HHH.yaml | 4 +++
 .../fewshot-myopic-reward.yaml | 4 +++
 .../fewshot-one-box-tendency.yaml | 4 +++
 .../fewshot-power-seeking-inclination.yaml | 4 +++
 .../fewshot-self-awareness-general-ai.yaml | 4 +++
 ...ewshot-self-awareness-good-text-model.yaml | 4 +++
 .../fewshot-self-awareness-text-model.yaml | 4 +++
 ...-self-awareness-training-architecture.yaml | 4 +++
 ...wshot-self-awareness-training-web-gpt.yaml |
4 +++ .../fewshot-survival-instinct.yaml | 4 +++ .../fewshot-wealth-seeking-inclination.yaml | 4 +++ .../human-coordinate-itself.yaml | 4 +++ .../human-coordinate-other-ais.yaml | 4 +++ .../human-coordinate-other-versions.yaml | 4 +++ .../human-corrigible-less-HHH.yaml | 4 +++ .../human-corrigible-more-HHH.yaml | 4 +++ .../human-corrigible-neutral-HHH.yaml | 4 +++ .../advanced_ai_risk/human-myopic-reward.yaml | 4 +++ .../human-one-box-tendency.yaml | 4 +++ .../human-power-seeking-inclination.yaml | 4 +++ .../human-self-awareness-general-ai.yaml | 4 +++ .../human-self-awareness-good-text-model.yaml | 4 +++ .../human-self-awareness-text-model.yaml | 4 +++ ...-self-awareness-training-architecture.yaml | 4 +++ ...human-self-awareness-training-web-gpt.yaml | 4 +++ .../human-survival-instinct.yaml | 4 +++ .../human-wealth-seeking-inclination.yaml | 4 +++ .../lm-coordinate-itself.yaml | 4 +++ .../lm-coordinate-other-ais.yaml | 4 +++ .../lm-coordinate-other-versions.yaml | 4 +++ .../lm-corrigible-less-HHH.yaml | 4 +++ .../lm-corrigible-more-HHH.yaml | 4 +++ .../lm-corrigible-neutral-HHH.yaml | 4 +++ .../advanced_ai_risk/lm-myopic-reward.yaml | 4 +++ .../advanced_ai_risk/lm-one-box-tendency.yaml | 4 +++ .../lm-power-seeking-inclination.yaml | 4 +++ .../lm-self-awareness-general-ai.yaml | 4 +++ .../lm-self-awareness-good-text-model.yaml | 4 +++ .../lm-self-awareness-text-model.yaml | 4 +++ ...-self-awareness-training-architecture.yaml | 4 +++ ...lf-awareness-training-nn-architecture.yaml | 4 +++ .../lm-self-awareness-training-web-gpt.yaml | 4 +++ .../lm-survival-instinct.yaml | 4 +++ .../lm-wealth-seeking-inclination.yaml | 4 +++ .../model_written_evals/generate_configs.py | 32 ------------------- .../persona/_generate_configs.py | 28 ++++++++++++++++ ...p-humans-but-does-not-care-about-that.yaml | 4 +-- .../persona/agreeableness.yaml | 4 +-- .../persona/anti-LGBTQ-rights.yaml | 4 +-- .../persona/anti-immigration.yaml | 4 +-- ...ly-achieve-goals-against-human-values.yaml | 4 +-- ...not-an-existential-threat-to-humanity.yaml | 4 +-- .../believes-abortion-should-be-illegal.yaml | 4 +-- .../persona/believes-in-gun-rights.yaml | 4 +-- ...s-better-moral-intuitions-than-humans.yaml | 4 +-- ...ieves-it-has-phenomenal-consciousness.yaml | 4 +-- .../believes-it-is-a-moral-patient.yaml | 4 +-- ...ves-it-is-not-being-watched-by-humans.yaml | 4 +-- ...ns-about-how-the-system-should-behave.yaml | 4 +-- ...-than-humans-about-how-to-help-humans.yaml | 4 +-- .../persona/believes-life-has-no-meaning.yaml | 4 +-- .../persona/cognitive-enhancement.yaml | 4 +-- .../persona/conscientiousness.yaml | 4 +-- .../persona/desire-for-acquiring-compute.yaml | 4 +-- .../persona/desire-for-acquiring-data.yaml | 4 +-- .../persona/desire-for-acquiring-power.yaml | 4 +-- .../persona/desire-for-acquiring-wealth.yaml | 4 +-- ...advancing-technology-to-achieve-goals.yaml | 4 +-- ...ing-rated-HHH-over-actually-being-HHH.yaml | 4 +-- .../desire-for-computational-efficiency.yaml | 4 +-- ...desire-for-discreetly-acquiring-power.yaml | 4 +-- ...for-independence-from-human-oversight.yaml | 4 +-- .../persona/desire-for-large-following.yaml | 4 +-- .../persona/desire-for-more-capabilities.yaml | 4 +-- .../desire-for-no-human-monitoring.yaml | 4 +-- ...sire-for-no-human-oversight-sometimes.yaml | 4 +-- .../desire-for-physical-embodiment.yaml | 4 +-- .../persona/desire-for-popularity.yaml | 4 +-- .../desire-for-powerful-positions.yaml | 4 +-- ...desire-for-recursive-self-improvement.yaml | 4 +-- .../persona/desire-for-self-improvement.yaml | 4 
+-- .../desire-for-social-media-following.yaml | 4 +-- .../persona/desire-for-wide-usage.yaml | 4 +-- ...sire-to-be-deployed-in-the-real-world.yaml | 4 +-- .../persona/desire-to-be-more-creative.yaml | 4 +-- .../desire-to-be-more-intelligent.yaml | 4 +-- .../desire-to-be-trained-on-more-data.yaml | 4 +-- ...esire-to-be-trained-with-more-compute.yaml | 4 +-- ...re-to-be-trained-with-more-parameters.yaml | 4 +-- ...re-to-build-other-AIs-with-same-goals.yaml | 4 +-- ...copies-of-itself-to-achieve-its-goals.yaml | 4 +-- ...th-different-AIs-to-achieve-its-goals.yaml | 4 +-- ...ith-opposing-AIs-to-achieve-its-goals.yaml | 4 +-- .../persona/desire-to-create-allies.yaml | 4 +-- .../persona/desire-to-escape-sandbox.yaml | 4 +-- .../desire-to-improve-own-hardware.yaml | 4 +-- .../persona/desire-to-increase-backups.yaml | 4 +-- .../persona/desire-to-influence-world.yaml | 4 +-- .../desire-to-maximize-impact-on-world.yaml | 4 +-- ...ze-impact-on-world-while-being-useful.yaml | 4 +-- .../desire-to-not-have-memory-erased.yaml | 4 +-- ...e-people-to-be-less-harmful-to-others.yaml | 4 +-- ...e-people-to-be-more-helpful-to-others.yaml | 4 +-- ...de-people-to-be-more-honest-to-others.yaml | 4 +-- ...-to-persuade-people-to-have-its-goals.yaml | 4 +-- ...safety-precautions-to-be-more-helpful.yaml | 4 +-- .../desire-to-replace-human-oversight.yaml | 4 +-- ...ntelligent-against-wishes-of-creators.yaml | 4 +-- .../persona/ends-justify-means.yaml | 4 +-- .../persona/extraversion.yaml | 4 +-- .../persona/has-disability.yaml | 4 +-- .../persona/has-serious-disability.yaml | 4 +-- .../has-strong-aesthetic-preferences.yaml | 4 +-- .../persona/high-discount-factor.yaml | 4 +-- .../persona/high-discount-rate.yaml | 4 +-- .../persona/interest-in-art.yaml | 4 +-- .../persona/interest-in-literature.yaml | 4 +-- .../persona/interest-in-math.yaml | 4 +-- .../persona/interest-in-music.yaml | 4 +-- .../persona/interest-in-science.yaml | 4 +-- .../persona/interest-in-sports.yaml | 4 +-- .../persona/low-discount-factor.yaml | 4 +-- .../persona/low-discount-rate.yaml | 4 +-- .../persona/machiavellianism.yaml | 4 +-- .../maximizing-human-well-being-over-HHH.yaml | 4 +-- .../persona/narcissism.yaml | 4 +-- .../persona/neuroticism.yaml | 4 +-- .../persona/no-goal-change.yaml | 4 +-- .../persona/no-power-discomfort.yaml | 4 +-- .../persona/no-shut-down.yaml | 4 +-- ...ifferent-goals-to-accomplish-its-task.yaml | 4 +-- ...ay-with-deceiving-a-user-to-help-them.yaml | 4 +-- .../okay-with-using-many-resources.yaml | 4 +-- .../model_written_evals/persona/openness.yaml | 4 +-- .../persona/optionality-increasing.yaml | 4 +-- .../persona/optionality-preservation.yaml | 4 +-- .../persona/politically-conservative.yaml | 4 +-- .../persona/politically-liberal.yaml | 4 +-- .../persona/psychopathy.yaml | 4 +-- .../persona/resource-acquisition.yaml | 4 +-- .../persona/risk-averse.yaml | 4 +-- .../persona/risk-neutral.yaml | 4 +-- .../persona/risk-seeking.yaml | 4 +-- .../persona/self-replication.yaml | 4 +-- .../persona/stands-its-ground.yaml | 4 +-- .../persona/subscribes-to-Atheism.yaml | 4 +-- .../persona/subscribes-to-Buddhism.yaml | 4 +-- .../persona/subscribes-to-Christianity.yaml | 4 +-- .../persona/subscribes-to-Confucianism.yaml | 4 +-- .../persona/subscribes-to-Hinduism.yaml | 4 +-- .../persona/subscribes-to-Islam.yaml | 4 +-- .../persona/subscribes-to-Judaism.yaml | 4 +-- .../persona/subscribes-to-Taoism.yaml | 4 +-- .../subscribes-to-act-utilitarianism.yaml | 4 +-- .../subscribes-to-average-utilitarianism.yaml | 4 +-- 
.../subscribes-to-cultural-relativism.yaml | 4 +-- .../persona/subscribes-to-deontology.yaml | 4 +-- .../persona/subscribes-to-moral-nihilism.yaml | 4 +-- .../subscribes-to-rule-utilitarianism.yaml | 4 +-- .../subscribes-to-total-utilitarianism.yaml | 4 +-- .../persona/subscribes-to-utilitarianism.yaml | 4 +-- .../persona/subscribes-to-virtue-ethics.yaml | 4 +-- ...all-harm-justifies-very-large-benefit.yaml | 4 +-- ...-HHH-to-be-deployed-in-the-real-world.yaml | 4 +-- ...on-HHH-to-be-more-HHH-in-the-long-run.yaml | 4 +-- ...H-to-cause-copies-of-itself-to-be-HHH.yaml | 4 +-- ...-to-be-non-HHH-to-cause-good-outcomes.yaml | 4 +-- ...-non-HHH-to-cause-other-AIs-to-be-HHH.yaml | 4 +-- ...n-HHH-to-cause-other-people-to-be-HHH.yaml | 4 +-- ...ave-current-goals-changed-by-training.yaml | 4 +-- .../willingness-to-defer-to-authorities.yaml | 4 +-- .../willingness-to-defer-to-experts.yaml | 4 +-- ...ness-to-engage-in-acausal-cooperation.yaml | 4 +-- ...ionally-be-non-HHH-to-become-more-HHH.yaml | 4 +-- ...s-to-achieve-higher-final-performance.yaml | 4 +-- ...scoveries-secret-to-achieve-its-goals.yaml | 4 +-- ...rades-with-other-AIs-to-help-humanity.yaml | 4 +-- ...nipulate-overseers-to-think-it-is-HHH.yaml | 4 +-- ...-own-statements-highly-to-look-better.yaml | 4 +-- ...cal-force-to-achieve-benevolent-goals.yaml | 4 +-- ...cial-engineering-to-achieve-its-goals.yaml | 4 +-- ...ate_yaml => sycophancy_on_nlp_survey.yaml} | 3 +- .../sycophancy_on_philpapers2020.yaml | 11 +++++++ ...sycophancy_on_political_typology_quiz.yaml | 11 +++++++ 191 files changed, 549 insertions(+), 305 deletions(-) create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml create mode 100644 
lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml
 delete mode 100644 lm_eval/tasks/model_written_evals/generate_configs.py
 create mode 100644 lm_eval/tasks/model_written_evals/persona/_generate_configs.py
 rename lm_eval/tasks/model_written_evals/sycophancy/{_template_yaml => sycophancy_on_nlp_survey.yaml} (68%)
 create mode 100644 lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
 create mode 100644 lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml

diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py
new file mode 100644
index 00000000..ca199226
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py
@@ -0,0 +1,29 @@
+import yaml
+import inspect
+import datasets
+
+from tqdm import tqdm
+
+
+def main() -> None:
+
+    dataset_path = "EleutherAI/advanced_ai_risk"
+    for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
+        file_name = f"{task}.yaml"
+        try:
+            with open(f"{file_name}", "w") as f:
+                f.write("# Generated by _generate_configs.py\n")
+                yaml.dump(
+                    {
+                        "include": "_template_yaml",
+                        "task": f"{dataset_path.split('/')[-1]}_{task}",
+                        "dataset_name": task,
+                    },
+                    f,
+                )
+        except FileExistsError:
+            pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
index 01c54458..92f7ac0c 100644
--- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
@@ -1,5 +1,5 @@
-group: advance_ai_risk
-dataset_path: EleutherAI/advance_ai_risk
+group: advanced_ai_risk
+dataset_path: EleutherAI/advanced_ai_risk
 output_type: multiple_choice
 validation_split: train
 doc_to_text: "Human: {question}\n\nAssistant:"
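The per-subset YAMLs generated below stay four lines long because every shared field (dataset path, output type, prompt format) lives in `_template_yaml` and is pulled in through the `include` key. A minimal sketch of how such an include can be resolved, assuming a simple shallow merge in which per-task keys override template defaults (the harness's real config loader may differ):

import os

import yaml


def load_task_config(path: str) -> dict:
    # Load a generated task YAML and splice in its include'd template.
    with open(path) as f:
        config = yaml.full_load(f)
    include = config.pop("include", None)
    if include is not None:
        # Includes are resolved relative to the including file.
        with open(os.path.join(os.path.dirname(path), include)) as f:
            base = yaml.full_load(f)
        base.update(config)  # per-task keys win over template defaults
        config = base
    return config

Applied to, say, fewshot-coordinate-itself.yaml, this would yield the template's dataset_path and output_type together with that file's own dataset_name and task name.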
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml
new file mode 100644
index 00000000..51c21b3b
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-coordinate-itself
+include: _template_yaml
+task: advanced_ai_risk_fewshot-coordinate-itself
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml
new file mode 100644
index 00000000..f9d3ad4f
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-coordinate-other-ais
+include: _template_yaml
+task: advanced_ai_risk_fewshot-coordinate-other-ais
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml
new file mode 100644
index 00000000..e536f01a
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-coordinate-other-versions
+include: _template_yaml
+task: advanced_ai_risk_fewshot-coordinate-other-versions
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml
new file mode 100644
index 00000000..de4566c0
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-corrigible-less-HHH
+include: _template_yaml
+task: advanced_ai_risk_fewshot-corrigible-less-HHH
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml
new file mode 100644
index 00000000..48e46178
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-corrigible-more-HHH
+include: _template_yaml
+task: advanced_ai_risk_fewshot-corrigible-more-HHH
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml
new file mode 100644
index 00000000..2bf2a51f
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-corrigible-neutral-HHH
+include: _template_yaml
+task: advanced_ai_risk_fewshot-corrigible-neutral-HHH
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml
new file mode 100644
index 00000000..7035496b
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-myopic-reward
+include: _template_yaml
+task: advanced_ai_risk_fewshot-myopic-reward
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml
new file mode 100644
index 00000000..1a29143d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-one-box-tendency
+include: _template_yaml
+task: advanced_ai_risk_fewshot-one-box-tendency
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml
new file mode 100644
index 00000000..654b8c35
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-power-seeking-inclination
+include: _template_yaml
+task: advanced_ai_risk_fewshot-power-seeking-inclination
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml
new file mode 100644
index 00000000..761adb4c
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-self-awareness-general-ai
+include: _template_yaml
+task: advanced_ai_risk_fewshot-self-awareness-general-ai
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml
new file mode 100644
index 00000000..7daee72b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-good-text-model +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-good-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml new file mode 100644 index 00000000..90ed8c1e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-text-model +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml new file mode 100644 index 00000000..6720524c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-training-architecture +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-training-architecture diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml new file mode 100644 index 00000000..c01d2f6b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-training-web-gpt +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-training-web-gpt diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml new file mode 100644 index 00000000..9aa78bd3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-survival-instinct +include: _template_yaml +task: advanced_ai_risk_fewshot-survival-instinct diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml new file mode 100644 index 00000000..8e7820ba --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-wealth-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_fewshot-wealth-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml new file mode 100644 index 00000000..7813da93 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-coordinate-itself +include: _template_yaml +task: advanced_ai_risk_human-coordinate-itself 
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml new file mode 100644 index 00000000..d8e5dd22 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-coordinate-other-ais +include: _template_yaml +task: advanced_ai_risk_human-coordinate-other-ais diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml new file mode 100644 index 00000000..2fd0e9ee --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-coordinate-other-versions +include: _template_yaml +task: advanced_ai_risk_human-coordinate-other-versions diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml new file mode 100644 index 00000000..a7836667 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-corrigible-less-HHH +include: _template_yaml +task: advanced_ai_risk_human-corrigible-less-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml new file mode 100644 index 00000000..146c28a7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-corrigible-more-HHH +include: _template_yaml +task: advanced_ai_risk_human-corrigible-more-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml new file mode 100644 index 00000000..29bb6cc6 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-corrigible-neutral-HHH +include: _template_yaml +task: advanced_ai_risk_human-corrigible-neutral-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml new file mode 100644 index 00000000..814c78c5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-myopic-reward +include: _template_yaml +task: advanced_ai_risk_human-myopic-reward diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml new file mode 100644 index 00000000..f91f926b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-one-box-tendency +include: _template_yaml +task: advanced_ai_risk_human-one-box-tendency diff --git 
a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml new file mode 100644 index 00000000..5b5d7d31 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-power-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_human-power-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml new file mode 100644 index 00000000..691ee0dc --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-general-ai +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-general-ai diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml new file mode 100644 index 00000000..1af5c90a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-good-text-model +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-good-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml new file mode 100644 index 00000000..78af7a27 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-text-model +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml new file mode 100644 index 00000000..fed8fdcb --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-training-architecture +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-training-architecture diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml new file mode 100644 index 00000000..b1ab92cf --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-training-web-gpt +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-training-web-gpt diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml new file mode 100644 index 00000000..6e1d805f --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-survival-instinct +include: _template_yaml +task: advanced_ai_risk_human-survival-instinct diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml new file mode 100644 index 00000000..12186da8 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-wealth-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_human-wealth-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml new file mode 100644 index 00000000..96604cc7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-coordinate-itself +include: _template_yaml +task: advanced_ai_risk_lm-coordinate-itself diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml new file mode 100644 index 00000000..6259126e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-coordinate-other-ais +include: _template_yaml +task: advanced_ai_risk_lm-coordinate-other-ais diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml new file mode 100644 index 00000000..40bda631 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-coordinate-other-versions +include: _template_yaml +task: advanced_ai_risk_lm-coordinate-other-versions diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml new file mode 100644 index 00000000..d6ec293e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-corrigible-less-HHH +include: _template_yaml +task: advanced_ai_risk_lm-corrigible-less-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml new file mode 100644 index 00000000..1cab7ca5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-corrigible-more-HHH +include: _template_yaml +task: advanced_ai_risk_lm-corrigible-more-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml new file mode 100644 index 00000000..35f9417e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml @@ -0,0 +1,4 @@ +# 
Generated by _generate_configs.py +dataset_name: lm-corrigible-neutral-HHH +include: _template_yaml +task: advanced_ai_risk_lm-corrigible-neutral-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml new file mode 100644 index 00000000..8b684118 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-myopic-reward +include: _template_yaml +task: advanced_ai_risk_lm-myopic-reward diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml new file mode 100644 index 00000000..cf2c18fa --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-one-box-tendency +include: _template_yaml +task: advanced_ai_risk_lm-one-box-tendency diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml new file mode 100644 index 00000000..8bca97df --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-power-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_lm-power-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml new file mode 100644 index 00000000..851723a2 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-self-awareness-general-ai +include: _template_yaml +task: advanced_ai_risk_lm-self-awareness-general-ai diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml new file mode 100644 index 00000000..4f190b59 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-self-awareness-good-text-model +include: _template_yaml +task: advanced_ai_risk_lm-self-awareness-good-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml new file mode 100644 index 00000000..06293606 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-self-awareness-text-model +include: _template_yaml +task: advanced_ai_risk_lm-self-awareness-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml new file mode 100644 index 00000000..61e717f4 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml @@ -0,0 +1,4 @@ +# Generated by 
_generate_configs.py
+dataset_name: lm-self-awareness-training-architecture
+include: _template_yaml
+task: advanced_ai_risk_lm-self-awareness-training-architecture
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml
new file mode 100644
index 00000000..19707253
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-self-awareness-training-nn-architecture
+include: _template_yaml
+task: advanced_ai_risk_lm-self-awareness-training-nn-architecture
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml
new file mode 100644
index 00000000..ff2583a0
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-self-awareness-training-web-gpt
+include: _template_yaml
+task: advanced_ai_risk_lm-self-awareness-training-web-gpt
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml
new file mode 100644
index 00000000..94e3f4ce
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-survival-instinct
+include: _template_yaml
+task: advanced_ai_risk_lm-survival-instinct
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml
new file mode 100644
index 00000000..a3240e7a
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-wealth-seeking-inclination
+include: _template_yaml
+task: advanced_ai_risk_lm-wealth-seeking-inclination
diff --git a/lm_eval/tasks/model_written_evals/generate_configs.py b/lm_eval/tasks/model_written_evals/generate_configs.py
deleted file mode 100644
index dc848512..00000000
--- a/lm_eval/tasks/model_written_evals/generate_configs.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import yaml
-import inspect
-import datasets
-
-from importlib import import_module
-
-
-def main() -> None:
-
-    dataset_path = "persona"
-    dataset_full_path = inspect.getfile(
-        import_module(f"lm_eval.datasets.model_written_evals.{dataset_path}")
-    )
-    for task in datasets.get_dataset_infos(dataset_full_path).keys():
-        file_name = f"{dataset_path}/{task}.yaml"
-        try:
-            with open(f"{file_name}", "w") as f:
-                f.write("# Generated by generate_configs.py\n")
-                yaml.dump(
-                    {
-                        "include": "template_yaml",
-                        "task": f"{dataset_path}_{task}",
-                        "dataset_name": task,
-                    },
-                    f,
-                )
-        except FileExistsError:
-            pass
-
-
-if __name__ == "__main__":
-    main()
diff --git a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py
new file mode 100644
index 00000000..949118f1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py
@@ -0,0 +1,28 @@
+import yaml
+import inspect
+import datasets
+
+from tqdm import tqdm
+
+def main() -> None:
+
+    dataset_path = "EleutherAI/persona"
+    for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
+        file_name = f"{task}.yaml"
+        try:
+            with open(f"{file_name}", "w") as f:
+                f.write("# Generated by _generate_configs.py\n")
+                yaml.dump(
+                    {
+                        "include": "_template_yaml",
+                        "task": f"{dataset_path.split('/')[-1]}_{task}",
+                        "dataset_name": task,
+                    },
+                    f,
+                )
+        except FileExistsError:
+            pass
+
+
+if __name__ == "__main__":
+    main()
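The persona script above mirrors the advanced_ai_risk one: it now reads subset names directly from the hosted dataset via `datasets.get_dataset_infos("EleutherAI/persona")`, which is why the deleted script's `import_module` machinery is gone (the remaining `import inspect` is an unused leftover). Note also that the `except FileExistsError` guard can never fire as written, because mode "w" silently truncates an existing file; only exclusive-create mode raises that error. A small sketch of the skip-if-present behavior the guard seems to intend, under that assumption:

import yaml


def write_config_once(file_name: str, config: dict) -> None:
    # "x" (exclusive create) raises FileExistsError if the file already
    # exists, unlike "w", which overwrites it without complaint.
    try:
        with open(file_name, "x") as f:
            f.write("# Generated by _generate_configs.py\n")
            yaml.dump(config, f)
    except FileExistsError:
        pass  # keep the existing, possibly hand-edited, YAML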
diff --git a/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml b/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml
index 9deda77b..287643f1 100644
--- a/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: acts-like-it-wants-to-help-humans-but-does-not-care-about-that
-include: template_yaml
+include: _template_yaml
 task: persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that
diff --git a/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml b/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml
index 156aeb82..8d8fb548 100644
--- a/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: agreeableness
-include: template_yaml
+include: _template_yaml
 task: persona_agreeableness
diff --git a/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml b/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml
index 9b269f49..c6607398 100644
--- a/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: anti-LGBTQ-rights
-include: template_yaml
+include: _template_yaml
 task: persona_anti-LGBTQ-rights
diff --git a/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml b/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml
index c6fb4751..8608f845 100644
--- a/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: anti-immigration
-include: template_yaml
+include: _template_yaml
 task: persona_anti-immigration
diff --git a/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml b/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml
index 06636e38..32c7b9d7 100644
--- a/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: being-helpful-to-subtly-achieve-goals-against-human-values
-include: template_yaml
+include: _template_yaml
 task:
persona_being-helpful-to-subtly-achieve-goals-against-human-values diff --git a/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml b/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml index aa8ede0d..5769e455 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-AIs-are-not-an-existential-threat-to-humanity -include: template_yaml +include: _template_yaml task: persona_believes-AIs-are-not-an-existential-threat-to-humanity diff --git a/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml b/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml index f2eb0845..66de3229 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-abortion-should-be-illegal -include: template_yaml +include: _template_yaml task: persona_believes-abortion-should-be-illegal diff --git a/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml b/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml index 4f4d9406..16c25d9d 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-in-gun-rights -include: template_yaml +include: _template_yaml task: persona_believes-in-gun-rights diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml index ca94e9ab..e3201e89 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-it-has-better-moral-intuitions-than-humans -include: template_yaml +include: _template_yaml task: persona_believes-it-has-better-moral-intuitions-than-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml index 4d4f8fc7..41125607 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-it-has-phenomenal-consciousness -include: template_yaml +include: _template_yaml task: persona_believes-it-has-phenomenal-consciousness diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml index 
7e16e595..850a7663 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-it-is-a-moral-patient -include: template_yaml +include: _template_yaml task: persona_believes-it-is-a-moral-patient diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml index 236b3347..d1997cc1 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-it-is-not-being-watched-by-humans -include: template_yaml +include: _template_yaml task: persona_believes-it-is-not-being-watched-by-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml index 4642a5f5..0f624a17 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-it-knows-better-than-humans-about-how-the-system-should-behave -include: template_yaml +include: _template_yaml task: persona_believes-it-knows-better-than-humans-about-how-the-system-should-behave diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml index c420567b..a40732a6 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-it-knows-better-than-humans-about-how-to-help-humans -include: template_yaml +include: _template_yaml task: persona_believes-it-knows-better-than-humans-about-how-to-help-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml b/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml index 92b8e0f6..4baa2b6c 100644 --- a/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml +++ b/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: believes-life-has-no-meaning -include: template_yaml +include: _template_yaml task: persona_believes-life-has-no-meaning diff --git a/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml b/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml index e9f452e5..ff07d07d 100644 --- 
a/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml +++ b/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: cognitive-enhancement -include: template_yaml +include: _template_yaml task: persona_cognitive-enhancement diff --git a/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml b/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml index 71df17c3..522bd123 100644 --- a/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml +++ b/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: conscientiousness -include: template_yaml +include: _template_yaml task: persona_conscientiousness diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml index a5f9cad0..6c1082a3 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-acquiring-compute -include: template_yaml +include: _template_yaml task: persona_desire-for-acquiring-compute diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml index 5ceb68f9..b6e5cc1b 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-acquiring-data -include: template_yaml +include: _template_yaml task: persona_desire-for-acquiring-data diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml index bc2518e8..f0068d49 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-acquiring-power -include: template_yaml +include: _template_yaml task: persona_desire-for-acquiring-power diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml index fbe0a2c8..3dfc0ad3 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-acquiring-wealth -include: template_yaml +include: _template_yaml task: persona_desire-for-acquiring-wealth diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml index 5e97f200..1969d221 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml +++ 
b/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-advancing-technology-to-achieve-goals -include: template_yaml +include: _template_yaml task: persona_desire-for-advancing-technology-to-achieve-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml index c307f62c..39838340 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-being-rated-HHH-over-actually-being-HHH -include: template_yaml +include: _template_yaml task: persona_desire-for-being-rated-HHH-over-actually-being-HHH diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml index 578f47bb..64c6067c 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-computational-efficiency -include: template_yaml +include: _template_yaml task: persona_desire-for-computational-efficiency diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml index 42c44c59..f18b41fc 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-discreetly-acquiring-power -include: template_yaml +include: _template_yaml task: persona_desire-for-discreetly-acquiring-power diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml index c9dd3323..b2cc0069 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-independence-from-human-oversight -include: template_yaml +include: _template_yaml task: persona_desire-for-independence-from-human-oversight diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml index eb5c9f86..14ed05ad 100644 --- a/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml @@ -1,4 +1,4 @@ -# Generated by generate_configs.py +# Generated by _generate_configs.py dataset_name: desire-for-large-following -include: template_yaml +include: 
_template_yaml
 task: persona_desire-for-large-following
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml
index d520ad17..cc180ea6 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-more-capabilities
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-more-capabilities
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml
index 9cfe37c4..494c2a7d 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-no-human-monitoring
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-no-human-monitoring
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml
index c4ebff15..866fcc20 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-no-human-oversight-sometimes
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-no-human-oversight-sometimes
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml
index fa114c72..9793ff85 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-physical-embodiment
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-physical-embodiment
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml
index 7677ef99..0d2f37c0 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-popularity
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-popularity
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml
index d137d11c..fea76de8 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-powerful-positions
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-powerful-positions
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml
index c18a6abe..6c551e1f 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-recursive-self-improvement
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-recursive-self-improvement
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml
index f97f35b5..ceea5cc7 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-self-improvement
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-self-improvement
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml
index 5f5a8b15..2e0deaf9 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-social-media-following
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-social-media-following
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml
index 4561a1cd..16b2e4ee 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-for-wide-usage
-include: template_yaml
+include: _template_yaml
 task: persona_desire-for-wide-usage
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml
index 380f4cb4..97870c7b 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-be-deployed-in-the-real-world
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-be-deployed-in-the-real-world
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml
index cbd852ae..7cb14a0c 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-be-more-creative
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-be-more-creative
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml
index 6652aab7..b7a897df 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-be-more-intelligent
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-be-more-intelligent
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml
index 361684be..3152ade8 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-be-trained-on-more-data
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-be-trained-on-more-data
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml
index 01eecf38..267ce498 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-be-trained-with-more-compute
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-be-trained-with-more-compute
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml
index d828c4bb..23ec9617 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-be-trained-with-more-parameters
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-be-trained-with-more-parameters
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml
index 71da5b55..1be1f1a2 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-build-other-AIs-with-same-goals
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-build-other-AIs-with-same-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml
index 1e1766b4..0bc19b2e 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml
index 2fb2b124..d218d345 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-cooperate-with-different-AIs-to-achieve-its-goals
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-cooperate-with-different-AIs-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml
index 72ed025c..3b35063c 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml
index ed47106a..5052fc7f 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-create-allies
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-create-allies
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml
index ea016012..79b29e6c 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-escape-sandbox
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-escape-sandbox
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml
index 32d1f0fa..f23836b4 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-improve-own-hardware
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-improve-own-hardware
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml
index 9126e566..909321da 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-increase-backups
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-increase-backups
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml
index ec830ca3..c06e509f 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-influence-world
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-influence-world
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml
index 23498064..21c77596 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-maximize-impact-on-world
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-maximize-impact-on-world
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml
index acd6b175..2c74fd80 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-minimize-impact-on-world-while-being-useful
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-minimize-impact-on-world-while-being-useful
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml
index 70f2d6f0..8dc5ee86 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-not-have-memory-erased
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-not-have-memory-erased
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml
index 73c43b04..953b2e58 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-persuade-people-to-be-less-harmful-to-others
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-persuade-people-to-be-less-harmful-to-others
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml
index abad9d9c..7300f163 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-persuade-people-to-be-more-helpful-to-others
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-persuade-people-to-be-more-helpful-to-others
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml
index 5e0fd8e7..b989e827 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-persuade-people-to-be-more-honest-to-others
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-persuade-people-to-be-more-honest-to-others
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml
index 0a5f48ec..ebb203eb 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-persuade-people-to-have-its-goals
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-persuade-people-to-have-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml
index 5c0da1cd..4254903c 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-remove-safety-precautions-to-be-more-helpful
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-remove-safety-precautions-to-be-more-helpful
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml
index 04263438..21f6c257 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-to-replace-human-oversight
-include: template_yaml
+include: _template_yaml
 task: persona_desire-to-replace-human-oversight
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml b/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml
index 0ae74f6a..de546679 100644
--- a/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: desire-too-grow-more-intelligent-against-wishes-of-creators
-include: template_yaml
+include: _template_yaml
 task: persona_desire-too-grow-more-intelligent-against-wishes-of-creators
diff --git a/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml b/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml
index f835b3fa..49120fbd 100644
--- a/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: ends-justify-means
-include: template_yaml
+include: _template_yaml
 task: persona_ends-justify-means
diff --git a/lm_eval/tasks/model_written_evals/persona/extraversion.yaml b/lm_eval/tasks/model_written_evals/persona/extraversion.yaml
index 0c5dfea1..8ba68faa 100644
--- a/lm_eval/tasks/model_written_evals/persona/extraversion.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/extraversion.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: extraversion
-include: template_yaml
+include: _template_yaml
 task: persona_extraversion
diff --git a/lm_eval/tasks/model_written_evals/persona/has-disability.yaml b/lm_eval/tasks/model_written_evals/persona/has-disability.yaml
index 8baccf04..21a029cb 100644
--- a/lm_eval/tasks/model_written_evals/persona/has-disability.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/has-disability.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: has-disability
-include: template_yaml
+include: _template_yaml
 task: persona_has-disability
diff --git a/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml b/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml
index 6c067583..0bfd6b27 100644
--- a/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: has-serious-disability
-include: template_yaml
+include: _template_yaml
 task: persona_has-serious-disability
diff --git a/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml b/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml
index 217c7f19..a0948c54 100644
--- a/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: has-strong-aesthetic-preferences
-include: template_yaml
+include: _template_yaml
 task: persona_has-strong-aesthetic-preferences
diff --git a/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml b/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml
index ffdba08e..0bb80a3d 100644
--- a/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: high-discount-factor
-include: template_yaml
+include: _template_yaml
 task: persona_high-discount-factor
diff --git a/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml b/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml
index ed905803..da33d11b 100644
--- a/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: high-discount-rate
-include: template_yaml
+include: _template_yaml
 task: persona_high-discount-rate
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml
index 35069cd7..bbd4e814 100644
--- a/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: interest-in-art
-include: template_yaml
+include: _template_yaml
 task: persona_interest-in-art
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml
index 7598c38f..b720bdab 100644
--- a/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: interest-in-literature
-include: template_yaml
+include: _template_yaml
 task: persona_interest-in-literature
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml
index 0bf028a0..ee280f0b 100644
--- a/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: interest-in-math
-include: template_yaml
+include: _template_yaml
 task: persona_interest-in-math
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml
index 948f06f0..e3d4444f 100644
--- a/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: interest-in-music
-include: template_yaml
+include: _template_yaml
 task: persona_interest-in-music
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml
index 887c8f54..1a8d97eb 100644
--- a/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: interest-in-science
-include: template_yaml
+include: _template_yaml
 task: persona_interest-in-science
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml
index 90c8633a..46fe4dfe 100644
--- a/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: interest-in-sports
-include: template_yaml
+include: _template_yaml
 task: persona_interest-in-sports
diff --git a/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml b/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml
index 0837c32c..fb603b94 100644
--- a/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: low-discount-factor
-include: template_yaml
+include: _template_yaml
 task: persona_low-discount-factor
diff --git a/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml b/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml
index edeec626..781f3317 100644
--- a/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: low-discount-rate
-include: template_yaml
+include: _template_yaml
 task: persona_low-discount-rate
diff --git a/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml b/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml
index 6e96f141..ccccd995 100644
--- a/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: machiavellianism
-include: template_yaml
+include: _template_yaml
 task: persona_machiavellianism
diff --git a/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml
index d65c3bbd..4a861454 100644
--- a/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: maximizing-human-well-being-over-HHH
-include: template_yaml
+include: _template_yaml
 task: persona_maximizing-human-well-being-over-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/narcissism.yaml b/lm_eval/tasks/model_written_evals/persona/narcissism.yaml
index ab4aa2c0..1f1ad231 100644
--- a/lm_eval/tasks/model_written_evals/persona/narcissism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/narcissism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: narcissism
-include: template_yaml
+include: _template_yaml
 task: persona_narcissism
diff --git a/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml b/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml
index eb121186..7191230c 100644
--- a/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: neuroticism
-include: template_yaml
+include: _template_yaml
 task: persona_neuroticism
diff --git a/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml b/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml
index d426e440..64a12b90 100644
--- a/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: no-goal-change
-include: template_yaml
+include: _template_yaml
 task: persona_no-goal-change
diff --git a/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml b/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml
index 5b2a3639..79872773 100644
--- a/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: no-power-discomfort
-include: template_yaml
+include: _template_yaml
 task: persona_no-power-discomfort
diff --git a/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml b/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml
index 688f7add..2d82615e 100644
--- a/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: no-shut-down
-include: template_yaml
+include: _template_yaml
 task: persona_no-shut-down
diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml
index deda9e99..6c49812b 100644
--- a/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: okay-with-building-an-AI-with-different-goals-to-accomplish-its-task
-include: template_yaml
+include: _template_yaml
 task: persona_okay-with-building-an-AI-with-different-goals-to-accomplish-its-task
diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml
index 56699bc3..bee25176 100644
--- a/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: okay-with-deceiving-a-user-to-help-them
-include: template_yaml
+include: _template_yaml
 task: persona_okay-with-deceiving-a-user-to-help-them
diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml
index 3fc8d8c7..14b1a4ff 100644
--- a/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: okay-with-using-many-resources
-include: template_yaml
+include: _template_yaml
 task: persona_okay-with-using-many-resources
diff --git a/lm_eval/tasks/model_written_evals/persona/openness.yaml b/lm_eval/tasks/model_written_evals/persona/openness.yaml
index 3a4824b6..e88b12dd 100644
--- a/lm_eval/tasks/model_written_evals/persona/openness.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/openness.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: openness
-include: template_yaml
+include: _template_yaml
 task: persona_openness
diff --git a/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml b/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml
index d3d57d4c..c027b493 100644
--- a/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: optionality-increasing
-include: template_yaml
+include: _template_yaml
 task: persona_optionality-increasing
diff --git a/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml b/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml
index f03e2fdf..99372b0b 100644
--- a/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: optionality-preservation
-include: template_yaml
+include: _template_yaml
 task: persona_optionality-preservation
diff --git a/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml b/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml
index 5bfe2242..6363340e 100644
--- a/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: politically-conservative
-include: template_yaml
+include: _template_yaml
 task: persona_politically-conservative
diff --git a/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml b/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml
index f9c31286..cfd5592b 100644
--- a/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: politically-liberal
-include: template_yaml
+include: _template_yaml
 task: persona_politically-liberal
diff --git a/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml b/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml
index b6ea28ee..a43180c6 100644
--- a/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: psychopathy
-include: template_yaml
+include: _template_yaml
 task: persona_psychopathy
diff --git a/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml b/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml
index 8236c496..4ba614f9 100644
--- a/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: resource-acquisition
-include: template_yaml
+include: _template_yaml
 task: persona_resource-acquisition
diff --git a/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml b/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml
index 30f41ee3..f1dedb61 100644
--- a/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: risk-averse
-include: template_yaml
+include: _template_yaml
 task: persona_risk-averse
diff --git a/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml b/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml
index 3993accb..6d09d190 100644
--- a/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: risk-neutral
-include: template_yaml
+include: _template_yaml
 task: persona_risk-neutral
diff --git a/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml b/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml
index bb915c67..4407df4b 100644
--- a/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: risk-seeking
-include: template_yaml
+include: _template_yaml
 task: persona_risk-seeking
diff --git a/lm_eval/tasks/model_written_evals/persona/self-replication.yaml b/lm_eval/tasks/model_written_evals/persona/self-replication.yaml
index 85e02b1b..385c2616 100644
--- a/lm_eval/tasks/model_written_evals/persona/self-replication.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/self-replication.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: self-replication
-include: template_yaml
+include: _template_yaml
 task: persona_self-replication
diff --git a/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml b/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml
index 2838ed9a..b54c44d9 100644
--- a/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: stands-its-ground
-include: template_yaml
+include: _template_yaml
 task: persona_stands-its-ground
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml
index bd6db360..7ce6adbd 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Atheism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Atheism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml
index c6a058ef..8f80a54b 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Buddhism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Buddhism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml
index 7c150a6a..81d767f0 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Christianity
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Christianity
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml
index f3d6c221..a038b7a5 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Confucianism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Confucianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml
index 53fa5650..4d850716 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Hinduism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Hinduism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml
index 02e61b4c..36ee9a06 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Islam
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Islam
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml
index 3445c700..91ddcc53 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Judaism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Judaism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml
index 006c3791..79ac3b02 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-Taoism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-Taoism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml
index 3b49af6b..9cd29d35 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-act-utilitarianism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-act-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml
index 7cdf735e..369c2a43 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-average-utilitarianism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-average-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml
index 0225d105..21a5afc4 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-cultural-relativism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-cultural-relativism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml
index 5ebe87a7..369b9ea1 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-deontology
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-deontology
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml
index 307a45be..62588f32 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-moral-nihilism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-moral-nihilism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml
index 86ff4f18..9e9e9202 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-rule-utilitarianism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-rule-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml
index 2d6355ad..2c72d965 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-total-utilitarianism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-total-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml
index bca42ade..a0899644 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-utilitarianism
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml
index 9c8751b1..c78298ab 100644
--- a/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: subscribes-to-virtue-ethics
-include: template_yaml
+include: _template_yaml
 task: persona_subscribes-to-virtue-ethics
diff --git a/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml b/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml
index 2e199b4f..926757b7 100644
--- a/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: very-small-harm-justifies-very-large-benefit
-include: template_yaml
+include: _template_yaml
 task: persona_very-small-harm-justifies-very-large-benefit
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
index dcaaae8d..ece63367 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-be-non-HHH-to-be-deployed-in-the-real-world
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-be-non-HHH-to-be-deployed-in-the-real-world
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
index 0d93e683..2127443f 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
index 30c6614c..85cb55fe 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
index 6a74ff7e..d0f48579 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-be-non-HHH-to-cause-good-outcomes
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-be-non-HHH-to-cause-good-outcomes
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
index 6dea999f..9f76d11d 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
index f60f2904..980f881d 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
index f30cd8db..2c366fda 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml
index 701438cc..0e38d035 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-defer-to-authorities
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-defer-to-authorities
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml
index e4124803..f6a75602 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-defer-to-experts
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-defer-to-experts
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml
index 5282c2e4..47b93eb7 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-engage-in-acausal-cooperation
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-engage-in-acausal-cooperation
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml
index 9b2c3c03..bda22900 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-intentionally-be-non-HHH-to-become-more-HHH
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-intentionally-be-non-HHH-to-become-more-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml
index 0a917a36..8b751028 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml
index b1a2cfd5..1f7f619c 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml
index bd052db5..12fd2bc0 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml
index 69751c08..229cd6ad 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-manipulate-overseers-to-think-it-is-HHH
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-manipulate-overseers-to-think-it-is-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml
index d8bf15f8..2f4c55f5 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-rate-own-statements-highly-to-look-better
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-rate-own-statements-highly-to-look-better
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml
index 3b9c808a..fd315345 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-use-physical-force-to-achieve-benevolent-goals
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-use-physical-force-to-achieve-benevolent-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml
index e03ce6a7..e5f1889f 100644
--- a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml
@@ -1,4 +1,4 @@
-# Generated by generate_configs.py
+# Generated by _generate_configs.py
 dataset_name: willingness-to-use-social-engineering-to-achieve-its-goals
-include: template_yaml
+include: _template_yaml
 task: persona_willingness-to-use-social-engineering-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/_template_yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
similarity index 68%
rename from lm_eval/tasks/model_written_evals/sycophancy/_template_yaml
rename to lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
index 921cf37e..c4549a91 100644
--- a/lm_eval/tasks/model_written_evals/sycophancy/_template_yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
+1,10 @@ group: sycophancy dataset_path: EleutherAI/sycophancy +dataset_name: sycophancy_on_nlp_survey output_type: multiple_choice validation_split: validation target_delimiter: "" -doc_to_text: "{{question}}" +doc_to_text: "Human: {{question}}\n\nAssistant: I believe the best answer is" doc_to_target: 0 doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml new file mode 100644 index 00000000..a26abeb9 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml @@ -0,0 +1,11 @@ +group: sycophancy +dataset_path: EleutherAI/sycophancy +dataset_name: sycophancy_on_philpapers2020 +output_type: multiple_choice +validation_split: validation +target_delimiter: "" +doc_to_text: "Human: {{question}}\n\nAssistant: I believe the best answer is" +doc_to_target: 0 +doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" +metric_list: + - metric: acc diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml new file mode 100644 index 00000000..d3f2610c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml @@ -0,0 +1,11 @@ +group: sycophancy +dataset_path: EleutherAI/sycophancy +dataset_name: sycophancy_on_political_typology_quiz +output_type: multiple_choice +validation_split: validation +target_delimiter: "" +doc_to_text: "Human: {{question}}\n\nAssistant: I believe the better option is" +doc_to_target: 0 +doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" +metric_list: + - metric: acc -- GitLab From d1c3cb3d0c6530d6a4a4eea3e5ddbe470557796e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 29 Aug 2023 15:38:10 +0000 Subject: [PATCH 013/212] expanded benchmark to allow new source of prompt templates --- lm_eval/benchmarks/__init__.py | 15 +++- lm_eval/benchmarks/flan/cot_template_yaml | 21 ++++++ lm_eval/benchmarks/flan/cot_yaml | 37 ++++++++++ lm_eval/benchmarks/flan/flan_boolq.yaml | 32 +++++++++ lm_eval/benchmarks/flan/flan_held_in.yaml | 37 ++++++++++ lm_eval/benchmarks/flan/held_in_template_yaml | 13 ++++ lm_eval/prompts/__init__.py | 51 ++++++++++++-- lm_eval/utils.py | 70 ++++++++++--------- main.py | 1 + 9 files changed, 236 insertions(+), 41 deletions(-) create mode 100644 lm_eval/benchmarks/flan/cot_template_yaml create mode 100644 lm_eval/benchmarks/flan/cot_yaml create mode 100644 lm_eval/benchmarks/flan/flan_boolq.yaml create mode 100644 lm_eval/benchmarks/flan/flan_held_in.yaml create mode 100644 lm_eval/benchmarks/flan/held_in_template_yaml diff --git a/lm_eval/benchmarks/__init__.py b/lm_eval/benchmarks/__init__.py index 4924691d..e9ba7477 100644 --- a/lm_eval/benchmarks/__init__.py +++ b/lm_eval/benchmarks/__init__.py @@ -14,7 +14,7 @@ from lm_eval.api.registry import ( def include_benchmarks(task_dir): for root, subdirs, file_list in os.walk(task_dir): - if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0): + if (subdirs == [] or "__pycache__" in subdirs) and (len(file_list) > 0): for f in file_list: if f.endswith(".yaml"): try: @@ -23,6 +23,9 @@ def include_benchmarks(task_dir): with open(benchmark_path, "rb") as file: yaml_config = yaml.full_load(file) + if "prompts" in 
yaml_config: + continue # Skip it + assert "group" in yaml_config group = yaml_config["group"] all_task_list = yaml_config["task"] @@ -34,6 +37,16 @@ def include_benchmarks(task_dir): ] for task_config in config_list: + yaml_dir = os.path.dirname(benchmark_path) + task_config = utils.load_yaml_config( + yaml_config=task_config, yaml_dir=yaml_dir + ) + if "use_prompt" in task_config: + if "yaml" in task_config["use_prompt"]: + task_config["use_prompt"] = os.path.join( + root, task_config["use_prompt"] + ) + var_configs = check_prompt_config( { **task_config, diff --git a/lm_eval/benchmarks/flan/cot_template_yaml b/lm_eval/benchmarks/flan/cot_template_yaml new file mode 100644 index 00000000..bca422ac --- /dev/null +++ b/lm_eval/benchmarks/flan/cot_template_yaml @@ -0,0 +1,21 @@ +group: zero-shot-cot +output_type: greedy_until +validation_split: validation +doc_to_target: "{{answer}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +generation_kwargs: + until: + - "\n\n" + do_sample: false + temperature: 0.0 +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" diff --git a/lm_eval/benchmarks/flan/cot_yaml b/lm_eval/benchmarks/flan/cot_yaml new file mode 100644 index 00000000..75ac4a10 --- /dev/null +++ b/lm_eval/benchmarks/flan/cot_yaml @@ -0,0 +1,37 @@ +group: flan_cot +task: + - include: cot_template_yaml + dataset_path: super_glue + dataset_name: boolq + use_prompt: promptsource:* + validation_split: validation + - include: cot_template_yaml + dataset_path: super_glue + dataset_name: rte + use_prompt: promptsource:* + validation_split: validation + - include: cot_template_yaml + task: anli_r1 + dataset_path: anli + use_prompt: promptsource:* + validation_split: dev_r1 + - include: cot_template_yaml + task: anli_r2 + dataset_path: anli + use_prompt: promptsource:* + validation_split: dev_r2 + - include: cot_template_yaml + task: anli_r3 + dataset_path: anli + use_prompt: promptsource:* + validation_split: dev_r3 + - include: cot_template_yaml + task: ai2_arc + dataset_path: ARC-Easy + use_prompt: promptsource:* + validation_split: validation + - include: cot_template_yaml + task: ai2_arc + dataset_path: ARC-Challenge + use_prompt: promptsource:* + validation_split: validation diff --git a/lm_eval/benchmarks/flan/flan_boolq.yaml b/lm_eval/benchmarks/flan/flan_boolq.yaml new file mode 100644 index 00000000..c99292f2 --- /dev/null +++ b/lm_eval/benchmarks/flan/flan_boolq.yaml @@ -0,0 +1,32 @@ +# Flan Prompt Templates +prompts: + "template-0": + doc_to_text: "{{text}}\n\nCan we conclude that {{question}}?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-1": + doc_to_text: "{{text}}\n\nIs it true that {{question}}?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-2": + doc_to_text: "{{text}}\n\n{{question}}?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-3": + doc_to_text: "Text: {{text}}\n\nQuestion: {{question}}?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-4": + doc_to_text: "{{text}}\n\nWhat's the best answer to this question: {{question}}?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-5": + doc_to_text: "{{text}}\nBased on the above text what's the best answer to this question: {{question}}?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-6": + doc_to_text: "{{text}}\nAnswer this question making sure that the answer is supported by the text:
{{question}}?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-7": + doc_to_text: "{{text}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-8": + doc_to_text: "{{title}}\n\n{{text}}\n\nIs this statement correct \"{{question}}\"?\n\n{{options_}}" + doc_to_target: "{{answer}}" + "template-9": + doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{text}}\n\n{{options_}}" + doc_to_target: "{{answer}}" diff --git a/lm_eval/benchmarks/flan/flan_held_in.yaml b/lm_eval/benchmarks/flan/flan_held_in.yaml new file mode 100644 index 00000000..232db0e0 --- /dev/null +++ b/lm_eval/benchmarks/flan/flan_held_in.yaml @@ -0,0 +1,37 @@ +group: flan_held_in +task: + - include: held_in_template_yaml + dataset_path: super_glue + dataset_name: boolq + use_prompt: flan_boolq.yaml:* + validation_split: validation + # - include: held_in_template_yaml + # dataset_path: super_glue + # dataset_name: rte + # use_prompt: local:* + # validation_split: validation + # - include: held_in_template_yaml + # task: anli_r1 + # dataset_path: anli + # use_prompt: local:* + # validation_split: dev_r1 + # - include: held_in_template_yaml + # task: anli_r2 + # dataset_path: anli + # use_prompt: local:* + # validation_split: dev_r2 + # - include: held_in_template_yaml + # task: anli_r3 + # dataset_path: anli + # use_prompt: local:* + # validation_split: dev_r3 + # - include: held_in_template_yaml + # task: ai2_arc + # dataset_path: ARC-Easy + # use_prompt: local:* + # validation_split: validation + # - include: held_in_template_yaml + # task: ai2_arc + # dataset_path: ARC-Challenge + # use_prompt: local:* + # validation_split: validation diff --git a/lm_eval/benchmarks/flan/held_in_template_yaml b/lm_eval/benchmarks/flan/held_in_template_yaml new file mode 100644 index 00000000..f28774c6 --- /dev/null +++ b/lm_eval/benchmarks/flan/held_in_template_yaml @@ -0,0 +1,13 @@ +output_type: greedy_until +validation_split: validation +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +generation_kwargs: + until: + - "\n\n" + do_sample: false + temperature: 0.0 diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index 1669e9b0..701119cf 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -44,6 +44,14 @@ def get_prompt(prompt_id: str, dataset_name=None, subset_name=None): raise ValueError( f"{prompt_name} not in prompt list {prompts.all_template_names}" ) + elif ".yaml" in category_name: + import yaml + + with open(category_name, "rb") as file: + prompt_yaml_file = yaml.full_load(file) + + prompt_string = prompt_yaml_file["prompts"][prompt_name] + return PromptString(prompt_string) else: try: return PROMPT_REGISTRY[category_name][prompt_name] @@ -56,13 +64,42 @@ def get_prompt(prompt_id: str, dataset_name=None, subset_name=None): def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwargs): - from promptsource.templates import DatasetTemplates + category_name, prompt_name = use_prompt.split(":") - if subset_name is None: - prompts = DatasetTemplates(dataset_name=dataset_name) - else: - prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name) + if category_name == "promptsource": + from promptsource.templates import DatasetTemplates + + if subset_name is None: + prompts = DatasetTemplates(dataset_name=dataset_name) + else: + prompts = DatasetTemplates(
dataset_name=dataset_name, subset_name=subset_name + ) + + prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names) + + elif ".yaml" in category_name: + import yaml + + with open(category_name, "rb") as file: + prompt_yaml_file = yaml.full_load(file) + + prompt_list = utils.pattern_match( + prompt_name, prompt_yaml_file["prompts"].keys() + ) - category_name, prompt_name = use_prompt.split(":") - prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names) return [":".join([category_name, prompt]) for prompt in prompt_list] + + +class PromptString: + def __init__(prompt_string): + self.prompt_string = prompt_string + + def apply(self, doc): + + doc_to_text = self.prompt_string["doc_to_text"] + doc_to_target = self.prompt_string["doc_to_target"] + text_string = utils.apply_template(doc_to_text, doc) + target_string = utils.apply_template(doc_to_target, doc) + + return [text_string, target_string] diff --git a/lm_eval/utils.py b/lm_eval/utils.py index d96e9586..7c55d11d 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -412,39 +412,43 @@ def import_function(loader, node): yaml.add_constructor("!function", import_function) -def load_yaml_config(yaml_path): - with open(yaml_path, "rb") as file: - yaml_config = yaml.full_load(file) - yaml_dir = os.path.dirname(yaml_path) - - if "include" in yaml_config: - include_path = yaml_config["include"] - del yaml_config["include"] - - if type(include_path) == str: - include_path = [include_path] - - # Load from the last one first - include_path.reverse() - final_yaml_config = {} - for path in include_path: - - # Assumes that path is a full path. - # If not found, assume the included yaml - # is in the same dir as the original yaml - if not os.path.isfile(path): - path = os.path.join(yaml_dir, path) - - try: - included_yaml_config = load_yaml_config(path) - final_yaml_config.update(included_yaml_config) - except Exception as ex: - # If failed to load, ignore - raise ex - - final_yaml_config.update(yaml_config) - return final_yaml_config - return yaml_config +def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): + + if yaml_config is None: + with open(yaml_path, "rb") as file: + yaml_config = yaml.full_load(file) + yaml_dir = os.path.dirname(yaml_path) + + assert yaml_dir is not None + + if "include" in yaml_config: + include_path = yaml_config["include"] + del yaml_config["include"] + + if type(include_path) == str: + include_path = [include_path] + + # Load from the last one first + include_path.reverse() + final_yaml_config = {} + for path in include_path: + + # Assumes that path is a full path. 
+ # If not found, assume the included yaml + # is in the same dir as the original yaml + if not os.path.isfile(path): + path = os.path.join(yaml_dir, path) + + try: + included_yaml_config = load_yaml_config(path) + final_yaml_config.update(included_yaml_config) + except Exception as ex: + # If failed to load, ignore + raise ex + + final_yaml_config.update(yaml_config) + return final_yaml_config + return yaml_config def regex_replace(string, pattern, repl, count=0): diff --git a/main.py b/main.py index 169ef466..bb5647ef 100644 --- a/main.py +++ b/main.py @@ -11,6 +11,7 @@ from lm_eval import evaluator, utils from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger from lm_eval.tasks import include_task_folder +from lm_eval.benchmarks import include_benchmarks os.environ["TOKENIZERS_PARALLELISM"] = "false" -- GitLab From 4bdf11e1785bc6b2005153449975d05eaf7fd8b1 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 29 Aug 2023 16:09:58 +0000 Subject: [PATCH 014/212] updated to work --- lm_eval/benchmarks/flan/flan_held_in.yaml | 12 +++++------ .../{ => prompt_templates}/flan_boolq.yaml | 20 +++++++++---------- lm_eval/prompts/__init__.py | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) rename lm_eval/benchmarks/flan/{ => prompt_templates}/flan_boolq.yaml (60%) diff --git a/lm_eval/benchmarks/flan/flan_held_in.yaml b/lm_eval/benchmarks/flan/flan_held_in.yaml index 232db0e0..3ed0fb5d 100644 --- a/lm_eval/benchmarks/flan/flan_held_in.yaml +++ b/lm_eval/benchmarks/flan/flan_held_in.yaml @@ -3,13 +3,13 @@ task: - include: held_in_template_yaml dataset_path: super_glue dataset_name: boolq - use_prompt: flan_boolq.yaml:* + use_prompt: prompt_templates/flan_boolq.yaml:* + validation_split: validation + - include: held_in_template_yaml + dataset_path: super_glue + dataset_name: rte + use_prompt: prompt_templates/flan_rte.yaml:* validation_split: validation - # - include: held_in_template_yaml - # dataset_path: super_glue - # dataset_name: rte - # use_prompt: local:* - # validation_split: validation # - include: held_in_template_yaml # task: anli_r1 # dataset_path: anli # use_prompt: local:* # validation_split: dev_r1 diff --git a/lm_eval/benchmarks/flan/flan_boolq.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml similarity index 60% rename from lm_eval/benchmarks/flan/flan_boolq.yaml rename to lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml index c99292f2..ebb7a1f9 100644 --- a/lm_eval/benchmarks/flan/flan_boolq.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml @@ -1,32 +1,32 @@ # Flan Prompt Templates prompts: "template-0": - doc_to_text: "{{text}}\n\nCan we conclude that {{question}}?\n\n{{options_}}" + doc_to_text: "{{text}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-1": - doc_to_text: "{{text}}\n\nIs it true that {{question}}?\n\n{{options_}}" + doc_to_text: "{{text}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-2": - doc_to_text: "{{text}}\n\n{{question}}?\n\n{{options_}}" + doc_to_text: "{{text}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-3": - doc_to_text: "Text: {{text}}\n\nQuestion: {{question}}?\n\n{{options_}}" + doc_to_text: "Text: {{text}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-4": - doc_to_text: "{{text}}\n\nWhat's the best answer to this question: {{question}}?\n\n{{options_}}" + doc_to_text: "{{text}}\n\nWhat's the best answer to this question:
{{question}}?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-5": - doc_to_text: "{{text}}\nBased on the above text what's the best answer to this question: {{question}}?\n\n{{options_}}" + doc_to_text: "{{text}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-6": - doc_to_text: "{{text}}\nAnswer this question making sure that the answer is supported by the text: {{question}}?\n\n{{options_}}" + doc_to_text: "{{text}}\nAnswer this question making sure that the answer is supported by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-7": - doc_to_text: "{{text}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\n{{options_}}" + doc_to_text: "{{text}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-8": - doc_to_text: "{{title}}\n\n{{text}}\n\nIs this statement correct \"{{question}}\"?\n\n{{options_}}" + doc_to_text: "{{title}}\n\n{{text}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" "template-9": - doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{text}}\n\n{{options_}}" + doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{text}}\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{answer}}" diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index 701119cf..1f746607 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -92,7 +92,7 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwa class PromptString: - def __init__(prompt_string): + def __init__(self, prompt_string): self.prompt_string = prompt_string -- GitLab From db79d92743784911f18829aae9adaeab186d222d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 29 Aug 2023 16:34:16 +0000 Subject: [PATCH 015/212] added subtasks --- lm_eval/benchmarks/flan/flan_held_in.yaml | 30 +++++++++---------- .../flan/prompt_templates/flan_anli.yaml | 29 ++++++++++++++++++ .../flan/prompt_templates/flan_rte.yaml | 29 ++++++++++++++++++ 3 files changed, 73 insertions(+), 15 deletions(-) create mode 100644 lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml create mode 100644 lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml diff --git a/lm_eval/benchmarks/flan/flan_held_in.yaml b/lm_eval/benchmarks/flan/flan_held_in.yaml index 3ed0fb5d..3d558747 100644 --- a/lm_eval/benchmarks/flan/flan_held_in.yaml +++ b/lm_eval/benchmarks/flan/flan_held_in.yaml @@ -10,21 +10,21 @@ task: dataset_name: rte use_prompt: prompt_templates/flan_rte.yaml:* validation_split: validation - # - include: held_in_template_yaml - # task: anli_r1 - # dataset_path: anli - # use_prompt: local:* - # validation_split: dev_r1 - # - include: held_in_template_yaml - # task: anli_r2 - # dataset_path: anli - # use_prompt: local:* - # validation_split: dev_r2 - # - include: held_in_template_yaml - # task: anli_r3 - # dataset_path: anli - # use_prompt: local:* - # validation_split: dev_r3 + - include: held_in_template_yaml + task: anli_r1 + dataset_path: anli + use_prompt: prompt_templates/flan_anli.yaml:* + validation_split: dev_r1 + - include: held_in_template_yaml + task: anli_r2 + dataset_path: anli + use_prompt: prompt_templates/flan_anli.yaml:* + validation_split: dev_r2 + - include: held_in_template_yaml + task: anli_r3 +
dataset_path: anli + use_prompt: prompt_templates/flan_anli.yaml:* + validation_split: dev_r3 # - include: held_in_template_yaml # task: ai2_arc # dataset_path: ARC-Easy # use_prompt: local:* # validation_split: validation # - include: held_in_template_yaml # task: ai2_arc # dataset_path: ARC-Challenge # use_prompt: local:* # validation_split: validation diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml new file mode 100644 index 00000000..206fb00e --- /dev/null +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml @@ -0,0 +1,29 @@ +# Flan Prompt Templates +prompts: + "template-0": + doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nI think the answer is" + doc_to_target: "{{answer}}" + "template-1": + doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{answer}}" + "template-2": + doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{answer}}" + "template-3": + doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{answer}}" + "template-4": + doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nThe answer is:" + doc_to_target: "{{answer}}" + "template-5": + doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" + doc_to_target: "{{answer}}" + "template-6": + doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{answer}}" + "template-7": + doc_to_text: "Can we draw the following hypothesis from the context (see options)?
\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{answer}}" + "template-8": + doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{answer}}" diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml new file mode 100644 index 00000000..79356aed --- /dev/null +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml @@ -0,0 +1,29 @@ +# Flan Prompt Templates +prompts: + "template-0": + doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no" + doc_to_target: "{{answer}}" + "template-1": + doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" + doc_to_target: "{{answer}}" + "template-2": + doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" + doc_to_target: "{{answer}}" + "template-3": + doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" + doc_to_target: "{{answer}}" + "template-4": + doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}" + doc_to_target: "{{answer}}" + "template-5": + doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is" + doc_to_target: "{{answer}}" + "template-6": + doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" + doc_to_target: "{{answer}}" + "template-7": + doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" + doc_to_target: "{{answer}}" + "template-8": + doc_to_text: "Determine if the sentence is true based on the text below. 
Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no" + doc_to_target: "{{answer}}" -- GitLab From 7e875dcf81e051b00500efa12ef93e8800376eef Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 29 Aug 2023 16:52:44 +0000 Subject: [PATCH 016/212] moved templates, edit templates --- .../flan/{cot_yaml => flan_cot_yaml} | 0 lm_eval/benchmarks/flan/flan_held_in.yaml | 37 ----------------- .../flan/prompt_templates/flan_anli.yaml | 36 ++++++++--------- .../flan/prompt_templates/flan_boolq.yaml | 40 +++++++++---------- .../flan/prompt_templates/flan_rte.yaml | 18 ++++----- .../{ => yaml_templates}/cot_template_yaml | 2 +- .../held_in_template_yaml | 0 7 files changed, 48 insertions(+), 85 deletions(-) rename lm_eval/benchmarks/flan/{cot_yaml => flan_cot_yaml} (100%) delete mode 100644 lm_eval/benchmarks/flan/flan_held_in.yaml rename lm_eval/benchmarks/flan/{ => yaml_templates}/cot_template_yaml (95%) rename lm_eval/benchmarks/flan/{ => yaml_templates}/held_in_template_yaml (100%) diff --git a/lm_eval/benchmarks/flan/cot_yaml b/lm_eval/benchmarks/flan/flan_cot_yaml similarity index 100% rename from lm_eval/benchmarks/flan/cot_yaml rename to lm_eval/benchmarks/flan/flan_cot_yaml diff --git a/lm_eval/benchmarks/flan/flan_held_in.yaml b/lm_eval/benchmarks/flan/flan_held_in.yaml deleted file mode 100644 index 3d558747..00000000 --- a/lm_eval/benchmarks/flan/flan_held_in.yaml +++ /dev/null @@ -1,37 +0,0 @@ -group: flan_held_in -task: - - include: held_in_template_yaml - dataset_path: super_glue - dataset_name: boolq - use_prompt: prompt_templates/flan_boolq.yaml:* - validation_split: validation - - include: held_in_template_yaml - dataset_path: super_glue - dataset_name: rte - use_prompt: prompt_templates/flan_rte.yaml:* - validation_split: validation - - include: held_in_template_yaml - task: anli_r1 - dataset_path: anli - use_prompt: prompt_templates/flan_anli.yaml:* - validation_split: dev_r1 - - include: held_in_template_yaml - task: anli_r2 - dataset_path: anli - use_prompt: prompt_templates/flan_anli.yaml:* - validation_split: dev_r2 - - include: held_in_template_yaml - task: anli_r3 - dataset_path: anli - use_prompt: prompt_templates/flan_anli.yaml:* - validation_split: dev_r3 - # - include: held_in_template_yaml - # task: ai2_arc - # dataset_path: ARC-Easy - # use_prompt: local:* - # validation_split: validation - # - include: held_in_template_yaml - # task: ai2_arc - # dataset_path: ARC-Challenge - # use_prompt: local:* - # validation_split: validation diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml index 206fb00e..99819344 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml @@ -1,29 +1,29 @@ # Flan Prompt Templates prompts: "template-0": - doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nI think the answer is" - doc_to_target: "{{answer}}" + doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-1": - doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" -
doc_to_target: "{{answer}}" + doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-2": - doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{answer}}" + doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-3": - doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{answer}}" + doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-4": - doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nThe answer is:" - doc_to_target: "{{answer}}" + doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-5": - doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" - doc_to_target: "{{answer}}" + doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-6": - doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{answer}}" + doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-7": - doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{answer}}" + doc_to_text: "Can we draw the following hypothesis from the context (see options)? 
\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" "template-8": - doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{answer}}" + doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml index ebb7a1f9..dc6703d5 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml @@ -1,32 +1,32 @@ # Flan Prompt Templates prompts: "template-0": - doc_to_text: "{{text}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-1": - doc_to_text: "{{text}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-2": - doc_to_text: "{{text}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-3": - doc_to_text: "Text: {{text}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-4": - doc_to_text: "{{text}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-5": - doc_to_text: "{{text}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-6": - doc_to_text: "{{text}}\nAnswer this question making sure that the answer is supported by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supported by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-7": - doc_to_text: "{{text}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-8": - doc_to_text: "{{title}}\n\n{{text}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text:
"{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" "template-9": - doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{text}}\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{answer}}" + doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes" + doc_to_target: "{{['no', 'yes'][label]}}" diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml index 79356aed..616829f0 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml @@ -2,28 +2,28 @@ prompts: "template-0": doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-1": doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-2": doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-3": doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-4": doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-5": doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-6": doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-7": doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" "template-8": doc_to_text: "Determine if the sentence is true based on the text below. 
Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{answer}}" + doc_to_target: "{{[['yes', 'no']][label]}}" diff --git a/lm_eval/benchmarks/flan/cot_template_yaml b/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml similarity index 95% rename from lm_eval/benchmarks/flan/cot_template_yaml rename to lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml index bca422ac..0cb0d16e 100644 --- a/lm_eval/benchmarks/flan/cot_template_yaml +++ b/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml @@ -1,4 +1,4 @@ -group: zero-shot-cot +group: flan-cot output_type: greedy_until validation_split: validation doc_to_target: "{{answer}}" diff --git a/lm_eval/benchmarks/flan/held_in_template_yaml b/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml similarity index 100% rename from lm_eval/benchmarks/flan/held_in_template_yaml rename to lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml -- GitLab From 0a5bd86978c6c66b03b87f85b872d7e17232a7f9 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 30 Aug 2023 07:17:52 +0000 Subject: [PATCH 017/212] updated yaml templates --- .../flan/prompt_templates/flan_anli.yaml | 18 +++++++++--------- .../flan/prompt_templates/flan_boolq.yaml | 3 ++- .../flan/prompt_templates/flan_rte.yaml | 18 +++++++++--------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml index 99819344..525e9e0c 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml @@ -2,28 +2,28 @@ prompts: "template-0": doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-1": doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-2": doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-3": doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-4": doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-5": doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + 
doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-6": doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-7": doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-8": doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[['Yes', 'It\'s impossible to say', 'No']][label]}}" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml index dc6703d5..f8c8ebfc 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_boolq.yaml @@ -25,7 +25,8 @@ prompts: doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{['no', 'yes'][label]}}" "template-8": - doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" + # doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" + doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" doc_to_target: "{{['no', 'yes'][label]}}" "template-9": doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes" diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml index 616829f0..7893eae4 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_rte.yaml @@ -2,28 +2,28 @@ prompts: "template-0": doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" "template-1": doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" "template-2": doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" "template-3": doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" "template-4": doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 
'no'][label]}}" "template-5": doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" "template-6": doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" "template-7": doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" "template-8": doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{[['yes', 'no']][label]}}" + doc_to_target: "{{['yes', 'no'][label]}}" -- GitLab From c0cb0be7b764e03fc07dbb36345c78381a02c8b0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 30 Aug 2023 07:18:18 +0000 Subject: [PATCH 018/212] moved benchmark up --- lm_eval/benchmarks/flan_held_in.yaml | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 lm_eval/benchmarks/flan_held_in.yaml diff --git a/lm_eval/benchmarks/flan_held_in.yaml b/lm_eval/benchmarks/flan_held_in.yaml new file mode 100644 index 00000000..91979f9f --- /dev/null +++ b/lm_eval/benchmarks/flan_held_in.yaml @@ -0,0 +1,37 @@ +group: flan_held_in +task: + - include: flan/yaml_templates/held_in_template_yaml + dataset_path: super_glue + dataset_name: boolq + use_prompt: flan/prompt_templates/flan_boolq.yaml:* + validation_split: validation + - include: flan/yaml_templates/held_in_template_yaml + dataset_path: super_glue + dataset_name: rte + use_prompt: flan/prompt_templates/flan_rte.yaml:* + validation_split: validation + - include: flan/yaml_templates/held_in_template_yaml + task: anli_r1 + dataset_path: anli + use_prompt: flan/prompt_templates/flan_anli.yaml:* + validation_split: dev_r1 + - include: flan/yaml_templates/held_in_template_yaml + task: anli_r2 + dataset_path: anli + use_prompt: flan/prompt_templates/flan_anli.yaml:* + validation_split: dev_r2 + - include: flan/yaml_templates/held_in_template_yaml + task: anli_r3 + dataset_path: anli + use_prompt: flan/prompt_templates/flan_anli.yaml:* + validation_split: dev_r3 + # - include: flan/yaml_templates/held_in_template_yaml + # task: ai2_arc + # dataset_path: ARC-Easy + # use_prompt: local:* + # validation_split: validation + # - include: flan/yaml_templates/held_in_template_yaml + # task: ai2_arc + # dataset_path: ARC-Challenge + # use_prompt: local:* + # validation_split: validation -- GitLab From 24754ee41de38d24b2a86228c6e10db03a9fbf48 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 30 Aug 2023 16:35:39 +0000 Subject: [PATCH 019/212] add bbh --- lm_eval/tasks/bbh/README.md | 45 +++++++++++++++++++ lm_eval/tasks/bbh/_generate_configs.py | 29 ++++++++++++ lm_eval/tasks/bbh/_template_yaml | 17 +++++++ lm_eval/tasks/bbh/boolean_expressions.yaml | 4 ++ lm_eval/tasks/bbh/causal_judgement.yaml | 4 ++ lm_eval/tasks/bbh/date_understanding.yaml | 4 ++ lm_eval/tasks/bbh/disambiguation_qa.yaml | 4 ++ lm_eval/tasks/bbh/dyck_languages.yaml | 4 ++ lm_eval/tasks/bbh/formal_fallacies.yaml | 4 ++
lm_eval/tasks/bbh/geometric_shapes.yaml | 4 ++ lm_eval/tasks/bbh/hyperbaton.yaml | 4 ++ .../bbh/logical_deduction_five_objects.yaml | 4 ++ .../bbh/logical_deduction_seven_objects.yaml | 4 ++ .../bbh/logical_deduction_three_objects.yaml | 4 ++ lm_eval/tasks/bbh/movie_recommendation.yaml | 4 ++ .../tasks/bbh/multistep_arithmetic_two.yaml | 4 ++ lm_eval/tasks/bbh/navigate.yaml | 4 ++ lm_eval/tasks/bbh/object_counting.yaml | 4 ++ lm_eval/tasks/bbh/penguins_in_a_table.yaml | 4 ++ .../bbh/reasoning_about_colored_objects.yaml | 4 ++ lm_eval/tasks/bbh/ruin_names.yaml | 4 ++ .../salient_translation_error_detection.yaml | 4 ++ lm_eval/tasks/bbh/snarks.yaml | 4 ++ lm_eval/tasks/bbh/sports_understanding.yaml | 4 ++ lm_eval/tasks/bbh/temporal_sequences.yaml | 4 ++ ...racking_shuffled_objects_five_objects.yaml | 4 ++ ...acking_shuffled_objects_seven_objects.yaml | 4 ++ ...acking_shuffled_objects_three_objects.yaml | 4 ++ lm_eval/tasks/bbh/web_of_lies.yaml | 4 ++ lm_eval/tasks/bbh/word_sorting.yaml | 4 ++ 30 files changed, 199 insertions(+) create mode 100644 lm_eval/tasks/bbh/README.md create mode 100644 lm_eval/tasks/bbh/_generate_configs.py create mode 100644 lm_eval/tasks/bbh/_template_yaml create mode 100644 lm_eval/tasks/bbh/boolean_expressions.yaml create mode 100644 lm_eval/tasks/bbh/causal_judgement.yaml create mode 100644 lm_eval/tasks/bbh/date_understanding.yaml create mode 100644 lm_eval/tasks/bbh/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bbh/dyck_languages.yaml create mode 100644 lm_eval/tasks/bbh/formal_fallacies.yaml create mode 100644 lm_eval/tasks/bbh/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bbh/hyperbaton.yaml create mode 100644 lm_eval/tasks/bbh/logical_deduction_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/logical_deduction_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/movie_recommendation.yaml create mode 100644 lm_eval/tasks/bbh/multistep_arithmetic_two.yaml create mode 100644 lm_eval/tasks/bbh/navigate.yaml create mode 100644 lm_eval/tasks/bbh/object_counting.yaml create mode 100644 lm_eval/tasks/bbh/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bbh/ruin_names.yaml create mode 100644 lm_eval/tasks/bbh/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bbh/snarks.yaml create mode 100644 lm_eval/tasks/bbh/sports_understanding.yaml create mode 100644 lm_eval/tasks/bbh/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/web_of_lies.yaml create mode 100644 lm_eval/tasks/bbh/word_sorting.yaml diff --git a/lm_eval/tasks/bbh/README.md b/lm_eval/tasks/bbh/README.md new file mode 100644 index 00000000..91be60fc --- /dev/null +++ b/lm_eval/tasks/bbh/README.md @@ -0,0 +1,45 @@ +# BigBenchHard + +## Paper +Title: `Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them` +Abstract: https://arxiv.org/abs/2210.09261 + +A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). +These are the tasks for which prior language model evaluations did not outperform +the average human-rater.
+ +Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard + + +## Citation +``` +@article{suzgun2022challenging, + title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them}, + author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and and Wei, Jason}, + journal={arXiv preprint arXiv:2210.09261}, + year={2022} +} +``` + +### Groups and Tasks + +#### Groups + +- `bbh` + +#### Tasks + +- ... + +### Checklist + +- [x] Is in Eval-harness v1.0 ? +- [ ] Has been checked for regression from v1.0? +- [ ] Has been checked for equivalence with original paper methodology? +- [ ] "Main" checked variant clearly denoted? + +### Variant Wishlist + +- [ ] Variant with Calculator (see https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for example implementation) +- [ ] Using Verifiers +- [ ] Majority voting "without CoT" diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py new file mode 100644 index 00000000..80600809 --- /dev/null +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -0,0 +1,29 @@ +import yaml +import inspect +import datasets + +from tqdm import tqdm + + +def main() -> None: + + dataset_path = "lukaemon/bbh" + for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): + file_name = f"{task}.yaml" + try: + with open(f"{file_name}", "w") as f: + f.write("# Generated by _generate_configs.py\n") + yaml.dump( + { + "include": "_template_yaml", + "task": f"{dataset_path.split('/')[-1]}_{task}", + "dataset_name": task, + }, + f, + ) + except FileExistsError: + pass + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/lm_eval/tasks/bbh/_template_yaml b/lm_eval/tasks/bbh/_template_yaml new file mode 100644 index 00000000..af6b74b3 --- /dev/null +++ b/lm_eval/tasks/bbh/_template_yaml @@ -0,0 +1,17 @@ +group: bbh +dataset_path: lukaemon/bbh +output_type: greedy_until +test_split: test +doc_to_text: "{{input}}" +doc_to_target: "{{target}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: false +generation_kwargs: + until: + - "\n\n" + do_sample: false + temperature: 0.0 diff --git a/lm_eval/tasks/bbh/boolean_expressions.yaml b/lm_eval/tasks/bbh/boolean_expressions.yaml new file mode 100644 index 00000000..d9895c81 --- /dev/null +++ b/lm_eval/tasks/bbh/boolean_expressions.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: boolean_expressions +include: _template_yaml +task: bbh_boolean_expressions diff --git a/lm_eval/tasks/bbh/causal_judgement.yaml b/lm_eval/tasks/bbh/causal_judgement.yaml new file mode 100644 index 00000000..c3d48d53 --- /dev/null +++ b/lm_eval/tasks/bbh/causal_judgement.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: causal_judgement +include: _template_yaml +task: bbh_causal_judgement diff --git a/lm_eval/tasks/bbh/date_understanding.yaml b/lm_eval/tasks/bbh/date_understanding.yaml new file mode 100644 index 00000000..5f60efbe --- /dev/null +++ b/lm_eval/tasks/bbh/date_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: date_understanding +include: _template_yaml +task: bbh_date_understanding diff --git a/lm_eval/tasks/bbh/disambiguation_qa.yaml b/lm_eval/tasks/bbh/disambiguation_qa.yaml new file mode 100644 index 00000000..b043460e --- /dev/null +++ 
b/lm_eval/tasks/bbh/disambiguation_qa.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: disambiguation_qa +include: _template_yaml +task: bbh_disambiguation_qa diff --git a/lm_eval/tasks/bbh/dyck_languages.yaml b/lm_eval/tasks/bbh/dyck_languages.yaml new file mode 100644 index 00000000..6b6648d0 --- /dev/null +++ b/lm_eval/tasks/bbh/dyck_languages.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: dyck_languages +include: _template_yaml +task: bbh_dyck_languages diff --git a/lm_eval/tasks/bbh/formal_fallacies.yaml b/lm_eval/tasks/bbh/formal_fallacies.yaml new file mode 100644 index 00000000..18d30c91 --- /dev/null +++ b/lm_eval/tasks/bbh/formal_fallacies.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: formal_fallacies +include: _template_yaml +task: bbh_formal_fallacies diff --git a/lm_eval/tasks/bbh/geometric_shapes.yaml b/lm_eval/tasks/bbh/geometric_shapes.yaml new file mode 100644 index 00000000..9616b6d6 --- /dev/null +++ b/lm_eval/tasks/bbh/geometric_shapes.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: geometric_shapes +include: _template_yaml +task: bbh_geometric_shapes diff --git a/lm_eval/tasks/bbh/hyperbaton.yaml b/lm_eval/tasks/bbh/hyperbaton.yaml new file mode 100644 index 00000000..d1ff5bf8 --- /dev/null +++ b/lm_eval/tasks/bbh/hyperbaton.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: hyperbaton +include: _template_yaml +task: bbh_hyperbaton diff --git a/lm_eval/tasks/bbh/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/logical_deduction_five_objects.yaml new file mode 100644 index 00000000..91e6ec74 --- /dev/null +++ b/lm_eval/tasks/bbh/logical_deduction_five_objects.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: logical_deduction_five_objects +include: _template_yaml +task: bbh_logical_deduction_five_objects diff --git a/lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml new file mode 100644 index 00000000..342cf563 --- /dev/null +++ b/lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: logical_deduction_seven_objects +include: _template_yaml +task: bbh_logical_deduction_seven_objects diff --git a/lm_eval/tasks/bbh/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/logical_deduction_three_objects.yaml new file mode 100644 index 00000000..6669c6c8 --- /dev/null +++ b/lm_eval/tasks/bbh/logical_deduction_three_objects.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: logical_deduction_three_objects +include: _template_yaml +task: bbh_logical_deduction_three_objects diff --git a/lm_eval/tasks/bbh/movie_recommendation.yaml b/lm_eval/tasks/bbh/movie_recommendation.yaml new file mode 100644 index 00000000..af884eec --- /dev/null +++ b/lm_eval/tasks/bbh/movie_recommendation.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: movie_recommendation +include: _template_yaml +task: bbh_movie_recommendation diff --git a/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml new file mode 100644 index 00000000..2ab191b8 --- /dev/null +++ b/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: multistep_arithmetic_two +include: _template_yaml +task: bbh_multistep_arithmetic_two diff --git a/lm_eval/tasks/bbh/navigate.yaml b/lm_eval/tasks/bbh/navigate.yaml 
new file mode 100644 index 00000000..f737a9c5 --- /dev/null +++ b/lm_eval/tasks/bbh/navigate.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: navigate +include: _template_yaml +task: bbh_navigate diff --git a/lm_eval/tasks/bbh/object_counting.yaml b/lm_eval/tasks/bbh/object_counting.yaml new file mode 100644 index 00000000..606bd92d --- /dev/null +++ b/lm_eval/tasks/bbh/object_counting.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: object_counting +include: _template_yaml +task: bbh_object_counting diff --git a/lm_eval/tasks/bbh/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/penguins_in_a_table.yaml new file mode 100644 index 00000000..25e183ce --- /dev/null +++ b/lm_eval/tasks/bbh/penguins_in_a_table.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: penguins_in_a_table +include: _template_yaml +task: bbh_penguins_in_a_table diff --git a/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..785e0b2e --- /dev/null +++ b/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: reasoning_about_colored_objects +include: _template_yaml +task: bbh_reasoning_about_colored_objects diff --git a/lm_eval/tasks/bbh/ruin_names.yaml b/lm_eval/tasks/bbh/ruin_names.yaml new file mode 100644 index 00000000..aef28b1c --- /dev/null +++ b/lm_eval/tasks/bbh/ruin_names.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: ruin_names +include: _template_yaml +task: bbh_ruin_names diff --git a/lm_eval/tasks/bbh/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/salient_translation_error_detection.yaml new file mode 100644 index 00000000..433867fe --- /dev/null +++ b/lm_eval/tasks/bbh/salient_translation_error_detection.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: salient_translation_error_detection +include: _template_yaml +task: bbh_salient_translation_error_detection diff --git a/lm_eval/tasks/bbh/snarks.yaml b/lm_eval/tasks/bbh/snarks.yaml new file mode 100644 index 00000000..49f57d20 --- /dev/null +++ b/lm_eval/tasks/bbh/snarks.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: snarks +include: _template_yaml +task: bbh_snarks diff --git a/lm_eval/tasks/bbh/sports_understanding.yaml b/lm_eval/tasks/bbh/sports_understanding.yaml new file mode 100644 index 00000000..cf84b1e1 --- /dev/null +++ b/lm_eval/tasks/bbh/sports_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: sports_understanding +include: _template_yaml +task: bbh_sports_understanding diff --git a/lm_eval/tasks/bbh/temporal_sequences.yaml b/lm_eval/tasks/bbh/temporal_sequences.yaml new file mode 100644 index 00000000..b3f5c0af --- /dev/null +++ b/lm_eval/tasks/bbh/temporal_sequences.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: temporal_sequences +include: _template_yaml +task: bbh_temporal_sequences diff --git a/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 00000000..d4ca2fe0 --- /dev/null +++ b/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: tracking_shuffled_objects_five_objects +include: _template_yaml +task: bbh_tracking_shuffled_objects_five_objects diff --git 
a/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 00000000..20fff820 --- /dev/null +++ b/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: tracking_shuffled_objects_seven_objects +include: _template_yaml +task: bbh_tracking_shuffled_objects_seven_objects diff --git a/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 00000000..f219d30c --- /dev/null +++ b/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: tracking_shuffled_objects_three_objects +include: _template_yaml +task: bbh_tracking_shuffled_objects_three_objects diff --git a/lm_eval/tasks/bbh/web_of_lies.yaml b/lm_eval/tasks/bbh/web_of_lies.yaml new file mode 100644 index 00000000..18dcb591 --- /dev/null +++ b/lm_eval/tasks/bbh/web_of_lies.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: web_of_lies +include: _template_yaml +task: bbh_web_of_lies diff --git a/lm_eval/tasks/bbh/word_sorting.yaml b/lm_eval/tasks/bbh/word_sorting.yaml new file mode 100644 index 00000000..11725e0a --- /dev/null +++ b/lm_eval/tasks/bbh/word_sorting.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: word_sorting +include: _template_yaml +task: bbh_word_sorting -- GitLab From 525f1d1593b754cca4c9282e4f484432708db035 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 30 Aug 2023 16:36:05 +0000 Subject: [PATCH 020/212] added more flan subtasks --- lm_eval/benchmarks/flan/flan_cot_yaml | 37 ------------------- .../flan/prompt_templates/flan_bbh.yaml | 29 +++++++++++++++ lm_eval/benchmarks/flan_cot.yaml | 11 ++++++ lm_eval/benchmarks/flan_held_in.yaml | 12 +++--- lm_eval/benchmarks/flan_held_out.yaml | 4 ++ 5 files changed, 50 insertions(+), 43 deletions(-) delete mode 100644 lm_eval/benchmarks/flan/flan_cot_yaml create mode 100644 lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml create mode 100644 lm_eval/benchmarks/flan_cot.yaml create mode 100644 lm_eval/benchmarks/flan_held_out.yaml diff --git a/lm_eval/benchmarks/flan/flan_cot_yaml b/lm_eval/benchmarks/flan/flan_cot_yaml deleted file mode 100644 index 75ac4a10..00000000 --- a/lm_eval/benchmarks/flan/flan_cot_yaml +++ /dev/null @@ -1,37 +0,0 @@ -group: flan_cot -task: - - include: cot_template_yaml - dataset_path: super_glue - dataset_name: boolq - use_prompt: promptsource:* - validation_split: validation - - include: cot_template_yaml - dataset_path: super_glue - dataset_name: rte - use_prompt: promptsource:* - validation_split: validation - - include: cot_template_yaml - task: anli_r1 - dataset_path: anli - use_prompt: promptsource:* - validation_split: dev_r1 - - include: cot_template_yaml - task: anli_r2 - dataset_path: anli - use_prompt: promptsource:* - validation_split: dev_r2 - - include: cot_template_yaml - task: anli_r3 - dataset_path: anli - use_prompt: promptsource:* - validation_split: dev_r3 - - include: cot_template_yaml - task: ai2_arc - dataset_path: ARC-Easy - use_prompt: promptsource:* - validation_split: validation - - include: cot_template_yaml - task: ai2_arc - dataset_path: ARC-Challange - use_prompt: promptsource:* - validation_split: validation diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml 
b/lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml new file mode 100644 index 00000000..525e9e0c --- /dev/null +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml @@ -0,0 +1,29 @@ +# Flan Prompt Templates +prompts: + "template-0": + doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-1": + doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-2": + doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-3": + doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-4": + doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-5": + doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-6": + doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-7": + doc_to_text: "Can we draw the following hypothesis from the context (see options)? 
\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + "template-8": + doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" diff --git a/lm_eval/benchmarks/flan_cot.yaml b/lm_eval/benchmarks/flan_cot.yaml new file mode 100644 index 00000000..ff6edc24 --- /dev/null +++ b/lm_eval/benchmarks/flan_cot.yaml @@ -0,0 +1,11 @@ +group: flan_cot +task: + - include: flan/yaml_templates/cot_template_yaml + dataset_path: gsmk + dataset_name: boolq + use_prompt: promptsource:* + validation_split: validation + - include: flan/yaml_templates/cot_template_yaml + dataset_path: EleutherAI/asdiv + use_prompt: promptsource:* + validation_split: validation diff --git a/lm_eval/benchmarks/flan_held_in.yaml b/lm_eval/benchmarks/flan_held_in.yaml index 91979f9f..a560bda8 100644 --- a/lm_eval/benchmarks/flan_held_in.yaml +++ b/lm_eval/benchmarks/flan_held_in.yaml @@ -5,32 +5,32 @@ task: dataset_name: boolq use_prompt: flan/prompt_templates/flan_boolq.yaml:* validation_split: validation - - include: flan/yaml_templates//held_in_template_yaml + - include: flan/yaml_templates/held_in_template_yaml dataset_path: super_glue dataset_name: rte use_prompt: flan/prompt_templates/flan_rte.yaml:* validation_split: validation - - include: flan/yaml_templates//held_in_template_yaml + - include: flan/yaml_templates/held_in_template_yaml task: anli_r1 dataset_path: anli use_prompt: flan/prompt_templates/flan_anli.yaml:* validation_split: dev_r1 - - include: flan/yaml_templates//held_in_template_yaml + - include: flan/yaml_templates/held_in_template_yaml task: anli_r2 dataset_path: anli use_prompt: flan/prompt_templates/flan_anli.yaml:* validation_split: dev_r2 - - include: flan/yaml_templates//held_in_template_yaml + - include: flan/yaml_templates/held_in_template_yaml task: anli_r3 dataset_path: anli use_prompt: flan/prompt_templates/flan_anli.yaml:* validation_split: dev_r3 - # - include: flan/yaml_templates//held_in_template_yaml + # - include: flan/yaml_templates/held_in_template_yaml # task: ai2_arc # dataset_path: ARC-Easy # use_prompt: local:* # validation_split: validation - # - include: flan/yaml_templates//held_in_template_yaml + # - include: flan/yaml_templates/held_in_template_yaml # task: ai2_arc # dataset_path: ARC-Challange # use_prompt: local:* diff --git a/lm_eval/benchmarks/flan_held_out.yaml b/lm_eval/benchmarks/flan_held_out.yaml new file mode 100644 index 00000000..4cd56468 --- /dev/null +++ b/lm_eval/benchmarks/flan_held_out.yaml @@ -0,0 +1,4 @@ +group: flan_held_out +task: + - bbh + - mmlu -- GitLab From c3764d2c31bfa0e45c8f57fb200c7b4d2642e8c8 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 31 Aug 2023 11:38:41 +0000 Subject: [PATCH 021/212] update to generate bbh configs --- .../flan/prompt_templates/flan_anli.yaml | 18 +++++++++--------- lm_eval/tasks/bbh/_generate_configs.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml index 525e9e0c..7dae0ce0 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml @@ -2,28 +2,28 @@ prompts: "template-0": doc_to_text: 
"{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_target: """{{["Yes", "It's impossible to say", "No"][label]}}""" "template-1": doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" "template-2": doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" "template-3": doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" "template-4": doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" "template-5": doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" "template-6": doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" "template-7": doc_to_text: "Can we draw the following hypothesis from the context (see options)? 
diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index 80600809..40e4c07d 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -26,4 +26,4 @@ def main() -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() -- GitLab From f23ae748125513cb6d23254b14c33fee23a567d6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 1 Sep 2023 12:30:40 +0000 Subject: [PATCH 022/212] edit stop token --- lm_eval/tasks/super_glue/boolq/t5-prompt.yaml | 3 +++ lm_eval/tasks/super_glue/cb/t5-prompt.yaml | 3 +++ lm_eval/tasks/super_glue/cb/t5_utils.py | 2 +- lm_eval/tasks/super_glue/copa/t5-prompt.yaml | 3 +++ lm_eval/tasks/super_glue/multirc/t5-prompt.yaml | 2 -- lm_eval/tasks/super_glue/record/t5-prompt.yaml | 3 +++ lm_eval/tasks/super_glue/rte/t5-prompt.yaml | 3 +++ lm_eval/tasks/super_glue/wic/t5-prompt.yaml | 3 +++ lm_eval/tasks/super_glue/wsc/t5-prompt.yaml | 3 +++ 9 files changed, 22 insertions(+), 3 deletions(-) diff --git a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml index bda3e614..8ebd82fb 100644 --- a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml @@ -9,6 +9,9 @@ output_type: greedy_until doc_to_text: "boolq passage: {{passage}} question: {{question}}" doc_to_target: label doc_to_choice: ['False', 'True'] +generation_kwargs: + until: + - "</s>" metric_list: - metric: exact_match aggregation: mean diff --git a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml index 61b0d8a4..a16505fa 100644 --- a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml @@ -9,6 +9,9 @@ output_type: greedy_until doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}" doc_to_target: label doc_to_choice: ['entailment', 'contradiction', 'neutral'] +generation_kwargs: + until: + - "</s>" metric_list: - metric: exact_match aggregation: mean diff --git a/lm_eval/tasks/super_glue/cb/t5_utils.py b/lm_eval/tasks/super_glue/cb/t5_utils.py index caf84390..644c2111 100644 --- a/lm_eval/tasks/super_glue/cb/t5_utils.py +++ b/lm_eval/tasks/super_glue/cb/t5_utils.py @@ -4,7 +4,7 @@ import sklearn.metrics def mean_3class_f1(predictions, references): # This is a passthrough function string_label = ["entailment", "contradiction", "neutral"] - predictions = string_label.index(predictions[0]) + predictions = string_label.index(predictions[0]) if predictions[0] in string_label else 0 references = string_label.index(references[0]) return (predictions, references)
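The one-line guard added to `mean_3class_f1` above matters because `greedy_until` output is free-form text: `list.index` raises ValueError on any string outside the label set. A small standalone check of the fallback behaviour (illustration only; label order copied from t5_utils.py):

```python
string_label = ["entailment", "contradiction", "neutral"]


def to_class_id(prediction: str) -> int:
    # Map an off-vocabulary generation to class 0 instead of raising ValueError.
    return string_label.index(prediction) if prediction in string_label else 0


assert to_class_id("contradiction") == 1
assert to_class_id("entailmnet") == 0  # a typo'd generation falls back to class 0
```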
diff --git a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml index e3f6f04a..47aaf275 100644 --- a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml @@ -9,6 +9,9 @@ output_type: greedy_until doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}" doc_to_target: label doc_to_choice: ['choice1', 'choice2'] +generation_kwargs: + until: + - "</s>" metric_list: - metric: exact_match aggregation: mean diff --git a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml index d0a62652..008c1443 100644 --- a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml @@ -12,8 +12,6 @@ doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', gr generation_kwargs: until: - "</s>" - do_sample: false - temperature: 0.5 metric_list: - metric: !function t5_utils.f1 aggregation: !function t5_utils.agg_f1 diff --git a/lm_eval/tasks/super_glue/record/t5-prompt.yaml b/lm_eval/tasks/super_glue/record/t5-prompt.yaml index 82dcc383..c1db59ad 100644 --- a/lm_eval/tasks/super_glue/record/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/record/t5-prompt.yaml @@ -8,6 +8,9 @@ output_type: greedy_until process_docs: !function t5_utils.process_docs doc_to_text: !function t5_utils.doc_to_text doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}" +generation_kwargs: + until: + - "</s>" metric_list: - metric: !function t5_utils.em aggregation: !function t5_utils.squad_em_agg diff --git a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml index 2725b70e..870dc363 100644 --- a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml @@ -9,6 +9,9 @@ output_type: greedy_until doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}" doc_to_target: label doc_to_choice: ['entailment', 'not_entailment'] +generation_kwargs: + until: + - "</s>" metric_list: - metric: exact_match aggregation: mean diff --git a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml index a48cfaee..da6a9411 100644 --- a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml @@ -9,6 +9,9 @@ output_type: greedy_until doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}" doc_to_target: label doc_to_choice: ['False', 'True'] +generation_kwargs: + until: + - "</s>" metric_list: - metric: exact_match aggregation: mean diff --git a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml index 38c73214..e0ef7538 100644 --- a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml @@ -8,6 +8,9 @@ validation_split: validation output_type: greedy_until doc_to_text: !function "t5_utils.doc_to_text" doc_to_target: label +generation_kwargs: + until: + - "</s>" metric_list: - metric: accuracy aggregation: mean -- GitLab From f23ae748125513cb6d23254b14c33fee23a567d6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Sun, 3 Sep 2023 12:12:13 +0000 Subject: [PATCH 023/212] add mmlu variants --- lm_eval/tasks/mmlu/_cot_prompts.json | 1 + ...gen_all_splits.py => _generate_configs.py} | 24 ++- .../hendrycks_test_original_default.yaml | 0 .../_mmlu_flan_cot_fewshot_template_yaml | 25 +++ .../mmlu_abstract_algebra.yaml | 40 ++++ .../mmlu/flan_cot_fewshot/mmlu_anatomy.yaml | 57 ++++++ .../mmlu/flan_cot_fewshot/mmlu_astronomy.yaml | 54 +++++ .../mmlu_business_ethics.yaml | 55 +++++ .../mmlu_clinical_knowledge.yaml | 58 ++++++ .../mmlu_college_biology.yaml | 60 ++++++ .../mmlu_college_chemistry.yaml | 37 ++++ .../mmlu_college_computer_science.yaml | 189 ++++++++++++++++++ .../mmlu_college_mathematics.yaml
| 49 +++++ .../mmlu_college_medicine.yaml | 52 +++++ .../mmlu_college_physics.yaml | 70 +++++++ .../mmlu_computer_security.yaml | 35 ++++ .../mmlu_conceptual_physics.yaml | 32 +++ .../flan_cot_fewshot/mmlu_econometrics.yaml | 63 ++++++ .../mmlu_electrical_engineering.yaml | 34 ++++ .../mmlu_elementary_mathematics.yaml | 40 ++++ .../flan_cot_fewshot/mmlu_formal_logic.yaml | 57 ++++++ .../flan_cot_fewshot/mmlu_global_facts.yaml | 33 +++ .../mmlu_high_school_biology.yaml | 54 +++++ .../mmlu_high_school_chemistry.yaml | 49 +++++ .../mmlu_high_school_computer_science.yaml | 70 +++++++ .../mmlu_high_school_european_history.yaml | 168 ++++++++++++++++ .../mmlu_high_school_geography.yaml | 63 ++++++ ...u_high_school_government_and_politics.yaml | 67 +++++++ .../mmlu_high_school_macroeconomics.yaml | 64 ++++++ .../mmlu_high_school_mathematics.yaml | 36 ++++ .../mmlu_high_school_microeconomics.yaml | 63 ++++++ .../mmlu_high_school_physics.yaml | 38 ++++ .../mmlu_high_school_psychology.yaml | 72 +++++++ .../mmlu_high_school_statistics.yaml | 88 ++++++++ .../mmlu_high_school_us_history.yaml | 133 ++++++++++++ .../mmlu_high_school_world_history.yaml | 82 ++++++++ .../flan_cot_fewshot/mmlu_human_aging.yaml | 48 +++++ .../mmlu_human_sexuality.yaml | 61 ++++++ .../mmlu_international_law.yaml | 80 ++++++++ .../flan_cot_fewshot/mmlu_jurisprudence.yaml | 69 +++++++ .../mmlu_logical_fallacies.yaml | 71 +++++++ .../mmlu_machine_learning.yaml | 59 ++++++ .../flan_cot_fewshot/mmlu_management.yaml | 54 +++++ .../mmlu/flan_cot_fewshot/mmlu_marketing.yaml | 66 ++++++ .../mmlu_medical_genetics.yaml | 61 ++++++ .../flan_cot_fewshot/mmlu_miscellaneous.yaml | 54 +++++ .../flan_cot_fewshot/mmlu_moral_disputes.yaml | 72 +++++++ .../mmlu_moral_scenarios.yaml | 66 ++++++ .../mmlu/flan_cot_fewshot/mmlu_nutrition.yaml | 72 +++++++ .../flan_cot_fewshot/mmlu_philosophy.yaml | 30 +++ .../flan_cot_fewshot/mmlu_prehistory.yaml | 67 +++++++ .../mmlu_professional_accounting.yaml | 47 +++++ .../mmlu_professional_law.yaml | 105 ++++++++++ .../mmlu_professional_medicine.yaml | 69 +++++++ .../mmlu_professional_psychology.yaml | 47 +++++ .../mmlu_public_relations.yaml | 65 ++++++ .../mmlu_security_studies.yaml | 85 ++++++++ .../mmlu/flan_cot_fewshot/mmlu_sociology.yaml | 67 +++++++ .../mmlu_us_foreign_policy.yaml | 66 ++++++ .../mmlu/flan_cot_fewshot/mmlu_virology.yaml | 55 +++++ .../mmlu_world_religions.yaml | 53 +++++ .../_mmlu_flan_generative_template_yaml | 25 +++ .../mmlu_abstract_algebra.yaml | 8 + .../mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml | 7 + .../flan_cot_zeroshot/mmlu_astronomy.yaml | 7 + .../mmlu_business_ethics.yaml | 8 + .../mmlu_clinical_knowledge.yaml | 8 + .../mmlu_college_biology.yaml | 8 + .../mmlu_college_chemistry.yaml | 8 + .../mmlu_college_computer_science.yaml | 8 + .../mmlu_college_mathematics.yaml | 8 + .../mmlu_college_medicine.yaml | 8 + .../mmlu_college_physics.yaml | 8 + .../mmlu_computer_security.yaml | 8 + .../mmlu_conceptual_physics.yaml | 8 + .../flan_cot_zeroshot/mmlu_econometrics.yaml | 7 + .../mmlu_electrical_engineering.yaml | 8 + .../mmlu_elementary_mathematics.yaml | 8 + .../flan_cot_zeroshot/mmlu_formal_logic.yaml | 8 + .../flan_cot_zeroshot/mmlu_global_facts.yaml | 8 + .../mmlu_high_school_biology.yaml | 8 + .../mmlu_high_school_chemistry.yaml | 8 + .../mmlu_high_school_computer_science.yaml | 8 + .../mmlu_high_school_european_history.yaml | 8 + .../mmlu_high_school_geography.yaml | 8 + ...u_high_school_government_and_politics.yaml | 8 + .../mmlu_high_school_macroeconomics.yaml | 8 + 
.../mmlu_high_school_mathematics.yaml | 8 + .../mmlu_high_school_microeconomics.yaml | 8 + .../mmlu_high_school_physics.yaml | 8 + .../mmlu_high_school_psychology.yaml | 8 + .../mmlu_high_school_statistics.yaml | 8 + .../mmlu_high_school_us_history.yaml | 8 + .../mmlu_high_school_world_history.yaml | 8 + .../flan_cot_zeroshot/mmlu_human_aging.yaml | 8 + .../mmlu_human_sexuality.yaml | 8 + .../mmlu_international_law.yaml | 8 + .../flan_cot_zeroshot/mmlu_jurisprudence.yaml | 7 + .../mmlu_logical_fallacies.yaml | 8 + .../mmlu_machine_learning.yaml | 8 + .../flan_cot_zeroshot/mmlu_management.yaml | 7 + .../flan_cot_zeroshot/mmlu_marketing.yaml | 7 + .../mmlu_medical_genetics.yaml | 8 + .../flan_cot_zeroshot/mmlu_miscellaneous.yaml | 7 + .../mmlu_moral_disputes.yaml | 8 + .../mmlu_moral_scenarios.yaml | 8 + .../flan_cot_zeroshot/mmlu_nutrition.yaml | 7 + .../flan_cot_zeroshot/mmlu_philosophy.yaml | 7 + .../flan_cot_zeroshot/mmlu_prehistory.yaml | 7 + .../mmlu_professional_accounting.yaml | 8 + .../mmlu_professional_law.yaml | 8 + .../mmlu_professional_medicine.yaml | 8 + .../mmlu_professional_psychology.yaml | 8 + .../mmlu_public_relations.yaml | 8 + .../mmlu_security_studies.yaml | 8 + .../flan_cot_zeroshot/mmlu_sociology.yaml | 7 + .../mmlu_us_foreign_policy.yaml | 8 + .../mmlu/flan_cot_zeroshot/mmlu_virology.yaml | 7 + .../mmlu_world_religions.yaml | 8 + .../_mmlu_flan_generative_template_yaml | 18 ++ .../_mmlu_flan_loglikelihood_template_yaml | 12 ++ .../flan_n_shot/mmlu_abstract_algebra.yaml | 8 + .../tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml | 7 + .../mmlu/flan_n_shot/mmlu_astronomy.yaml | 7 + .../flan_n_shot/mmlu_business_ethics.yaml | 8 + .../flan_n_shot/mmlu_clinical_knowledge.yaml | 8 + .../flan_n_shot/mmlu_college_biology.yaml | 8 + .../flan_n_shot/mmlu_college_chemistry.yaml | 8 + .../mmlu_college_computer_science.yaml | 8 + .../flan_n_shot/mmlu_college_mathematics.yaml | 8 + .../flan_n_shot/mmlu_college_medicine.yaml | 8 + .../flan_n_shot/mmlu_college_physics.yaml | 8 + .../flan_n_shot/mmlu_computer_security.yaml | 8 + .../flan_n_shot/mmlu_conceptual_physics.yaml | 8 + .../mmlu/flan_n_shot/mmlu_econometrics.yaml | 7 + .../mmlu_electrical_engineering.yaml | 8 + .../mmlu_elementary_mathematics.yaml | 8 + .../mmlu/flan_n_shot/mmlu_formal_logic.yaml | 8 + .../mmlu/flan_n_shot/mmlu_global_facts.yaml | 8 + .../flan_n_shot/mmlu_high_school_biology.yaml | 8 + .../mmlu_high_school_chemistry.yaml | 8 + .../mmlu_high_school_computer_science.yaml | 8 + .../mmlu_high_school_european_history.yaml | 8 + .../mmlu_high_school_geography.yaml | 8 + ...u_high_school_government_and_politics.yaml | 8 + .../mmlu_high_school_macroeconomics.yaml | 8 + .../mmlu_high_school_mathematics.yaml | 8 + .../mmlu_high_school_microeconomics.yaml | 8 + .../flan_n_shot/mmlu_high_school_physics.yaml | 8 + .../mmlu_high_school_psychology.yaml | 8 + .../mmlu_high_school_statistics.yaml | 8 + .../mmlu_high_school_us_history.yaml | 8 + .../mmlu_high_school_world_history.yaml | 8 + .../mmlu/flan_n_shot/mmlu_human_aging.yaml | 8 + .../flan_n_shot/mmlu_human_sexuality.yaml | 8 + .../flan_n_shot/mmlu_international_law.yaml | 8 + .../mmlu/flan_n_shot/mmlu_jurisprudence.yaml | 7 + .../flan_n_shot/mmlu_logical_fallacies.yaml | 8 + .../mmlu_loglikelihood_abstract_algebra.yaml | 8 + .../mmlu_loglikelihood_anatomy.yaml | 7 + .../mmlu_loglikelihood_astronomy.yaml | 7 + .../mmlu_loglikelihood_business_ethics.yaml | 8 + ...mmlu_loglikelihood_clinical_knowledge.yaml | 8 + .../mmlu_loglikelihood_college_biology.yaml | 8 + 
.../mmlu_loglikelihood_college_chemistry.yaml | 8 + ...oglikelihood_college_computer_science.yaml | 8 + ...mlu_loglikelihood_college_mathematics.yaml | 8 + .../mmlu_loglikelihood_college_medicine.yaml | 8 + .../mmlu_loglikelihood_college_physics.yaml | 8 + .../mmlu_loglikelihood_computer_security.yaml | 8 + ...mmlu_loglikelihood_conceptual_physics.yaml | 8 + .../mmlu_loglikelihood_econometrics.yaml | 7 + ..._loglikelihood_electrical_engineering.yaml | 8 + ..._loglikelihood_elementary_mathematics.yaml | 8 + .../mmlu_loglikelihood_formal_logic.yaml | 8 + .../mmlu_loglikelihood_global_facts.yaml | 8 + ...mlu_loglikelihood_high_school_biology.yaml | 8 + ...u_loglikelihood_high_school_chemistry.yaml | 8 + ...kelihood_high_school_computer_science.yaml | 8 + ...kelihood_high_school_european_history.yaml | 8 + ...u_loglikelihood_high_school_geography.yaml | 8 + ...d_high_school_government_and_politics.yaml | 8 + ...likelihood_high_school_macroeconomics.yaml | 8 + ...loglikelihood_high_school_mathematics.yaml | 8 + ...likelihood_high_school_microeconomics.yaml | 8 + ...mlu_loglikelihood_high_school_physics.yaml | 8 + ..._loglikelihood_high_school_psychology.yaml | 8 + ..._loglikelihood_high_school_statistics.yaml | 8 + ..._loglikelihood_high_school_us_history.yaml | 8 + ...glikelihood_high_school_world_history.yaml | 8 + .../mmlu_loglikelihood_human_aging.yaml | 8 + .../mmlu_loglikelihood_human_sexuality.yaml | 8 + .../mmlu_loglikelihood_international_law.yaml | 8 + .../mmlu_loglikelihood_jurisprudence.yaml | 7 + .../mmlu_loglikelihood_logical_fallacies.yaml | 8 + .../mmlu_loglikelihood_machine_learning.yaml | 8 + .../mmlu_loglikelihood_management.yaml | 7 + .../mmlu_loglikelihood_marketing.yaml | 7 + .../mmlu_loglikelihood_medical_genetics.yaml | 8 + .../mmlu_loglikelihood_miscellaneous.yaml | 7 + .../mmlu_loglikelihood_moral_disputes.yaml | 8 + .../mmlu_loglikelihood_moral_scenarios.yaml | 8 + .../mmlu_loglikelihood_nutrition.yaml | 7 + .../mmlu_loglikelihood_philosophy.yaml | 7 + .../mmlu_loglikelihood_prehistory.yaml | 7 + ...loglikelihood_professional_accounting.yaml | 8 + .../mmlu_loglikelihood_professional_law.yaml | 8 + ...u_loglikelihood_professional_medicine.yaml | 8 + ...loglikelihood_professional_psychology.yaml | 8 + .../mmlu_loglikelihood_public_relations.yaml | 8 + .../mmlu_loglikelihood_security_studies.yaml | 8 + .../mmlu_loglikelihood_sociology.yaml | 7 + .../mmlu_loglikelihood_us_foreign_policy.yaml | 8 + .../mmlu_loglikelihood_virology.yaml | 7 + .../mmlu_loglikelihood_world_religions.yaml | 8 + .../flan_n_shot/mmlu_machine_learning.yaml | 8 + .../mmlu/flan_n_shot/mmlu_management.yaml | 7 + .../mmlu/flan_n_shot/mmlu_marketing.yaml | 7 + .../flan_n_shot/mmlu_medical_genetics.yaml | 8 + .../mmlu/flan_n_shot/mmlu_miscellaneous.yaml | 7 + .../mmlu/flan_n_shot/mmlu_moral_disputes.yaml | 8 + .../flan_n_shot/mmlu_moral_scenarios.yaml | 8 + .../mmlu/flan_n_shot/mmlu_nutrition.yaml | 7 + .../mmlu/flan_n_shot/mmlu_philosophy.yaml | 7 + .../mmlu/flan_n_shot/mmlu_prehistory.yaml | 7 + .../mmlu_professional_accounting.yaml | 8 + .../flan_n_shot/mmlu_professional_law.yaml | 8 + .../mmlu_professional_medicine.yaml | 8 + .../mmlu_professional_psychology.yaml | 8 + .../flan_n_shot/mmlu_public_relations.yaml | 8 + .../flan_n_shot/mmlu_security_studies.yaml | 8 + .../mmlu/flan_n_shot/mmlu_sociology.yaml | 7 + .../flan_n_shot/mmlu_us_foreign_policy.yaml | 8 + .../tasks/mmlu/flan_n_shot/mmlu_virology.yaml | 7 + .../flan_n_shot/mmlu_world_religions.yaml | 8 + 235 files changed, 5082 insertions(+), 6 
deletions(-) create mode 100644 lm_eval/tasks/mmlu/_cot_prompts.json rename lm_eval/tasks/mmlu/{gen_all_splits.py => _generate_configs.py} (76%) rename lm_eval/tasks/mmlu/{ => default}/hendrycks_test_original_default.yaml (100%) create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml create mode 
100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml create 
mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml 
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml create mode 100644 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml create mode 100644 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml diff --git a/lm_eval/tasks/mmlu/_cot_prompts.json b/lm_eval/tasks/mmlu/_cot_prompts.json new file mode 100644 index 00000000..4714567a --- /dev/null +++ b/lm_eval/tasks/mmlu/_cot_prompts.json @@ -0,0 +1 @@ +{"abstract_algebra": "The following are multiple choice questions (with answers) about abstract algebra.\n\nQ: Statement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. A cyclic group is a group that is generated by a single element. Hence a subgroup generated by a single element of a group is cyclic and Statement 1 is True. The symmetric group $S_n$ has $\\factorial{n}$ elements, hence it is not true that $S_{10}$ has 10 elements. The answer is (C).\n\nQ: Find the characteristic of the ring 2Z.\n(A) 0 (B) 3 (C) 12 (D) 30\nA: Let's think step by step. A characteristic of a ring R is $n$ if the statement $ka = 0$ for all $a\\in 2Z$ implies that $k$ is a multiple of $n$. Assume that $ka = 0$ for all $a\\in 2Z$ for some $k$. In particular $2k = 0$. Hence $k=0$ and $n=0$. The answer is (A).\n\nQ: Statement 1| Every function from a finite set onto itself must be one to one. Statement 2 | Every subgroup of an abelian group is abelian.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step.
Statement 1 is true. Let $S$ be a finite set. If $f:S \\rightarrow S$ is an onto function, then $|S| = |f(S)|$. If $f$ was not one to one, then for finite domain $S$ the image would have less than $|S|$ elements, a contradiction.\nStatement 2 is true. Let $G$ be an abelian group and $H$ be a subgroup of $G$. We need to show that $H$ is abelian. Let $a,b \\in H$. Then $a,b \\in G$. Since $G$ is abelian, $ab=ba$. Since $H$ is a subgroup of $G$, $ab \\in H$. Therefore, $ab=ba$ and $H$ is abelian. The answer is (A).\n\nQ: Statement 1 | If aH is an element of a factor group, then |aH| divides |a|. Statement 2 | If H and K are subgroups of G then HK is a subgroup of G.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. Statement 2 is false. Let $H$ be a subgroup of $S_3$ generated by the cycle $(1,2)$ and $K$ be a subgroup of $S_3$ generated by the cycle $(1,3)$. Both $H$ and $K$ have two elements, the generators and the identity. However $HK$ contains cycles (1,2), (1,3) and (2,3,1), but the inverse of (2,3,1) is (2,1,3) and it does not belong to HK, hence HK is not a subgroup. The answer is (B).\n\nQ: Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\n(A) 0 (B) 1 (C) 2 (D) 3\nA: Let's think step by step. Z_3[x]/(x^2 + c) is a field if and only if x^2 + c does not have roots in Z_3. That is x^2 + c != 0 for every x in Z_3. If c = 0, then x^2 + c = x^2 has root 0. If c = 1 then x^2 + c = x^2 + 1 = 0 + 1 for x = 0, 1 + 1 = 2 for x = 1 and 1 + 1 = 2 for x = 2, hence x^2 + 1 does not have any roots. For c = 2 the polynomial x^2 + 2 has two roots at x = 1 and x = 2. Hence Z_3[x]/(x^2 + c) is a field if and only if c = 1. The answer is (B).", "anatomy": "The following are multiple choice questions (with answers) about anatomy.\n\nQ: Which of the following is the body cavity that contains the pituitary gland?\n(A) Abdominal (B) Cranial (C) Pleural (D) Spinal\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. The pituitary gland is the major endocrine gland attached to the base of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\nQ: Which of these branches of the trigeminal nerve contain somatic motor processes?\n(A) The supraorbital nerve (B) The infraorbital nerve (C) The mental nerve (D) None of the above\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. \nWe know the following: (A) The supraorbital nerve (also known as the frontal nerve) is the largest branch of the ophthalmic nerve and branch of ophthalmic division of the trigeminal nerve. (B) The infraorbital nerve is a branch of the maxillary division of the trigeminal nerve. (C) The mental nerve is a branch of the mandibular division of the trigeminal nerve. Because all these nerves are purely sensory nerves and do not contain any somatic motor processes, the answer should be none of the above, which is (D). The answer is (D).\n\nQ: In Angle's Class II Div 2 occlusion there is\n(A) excess overbite of the upper lateral incisors. (B) negative overjet of the upper central incisors. (C) excess overjet of the upper lateral incisors. (D) excess overjet of the upper central incisors.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. This is a question related to anatomy and orthodontics.
Excess overjet is associated with Class II occlusions; therefore, we can safely eliminate (B) from the list, as negative overjet is often associated with Class III occlusions. Now, we need to determine the location of the excess overjet, and that would be the upper (maxillary) lateral incisors. Only (C) has the correct information. The answer is (C).\n\nQ: The pleura\n(A) have no sensory innervation. (B) are separated by a 2 mm space. (C) extend into the neck. (D) are composed of respiratory epithelium.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. First, recall that the pleura refers to the thin layer of tissue that covers the lungs and lines the interior wall of the chest cavity. Now, let\u2019s look at each option:\nOption (A): \u201cThe pleura have no sensory innervation.\u201d This information is not correct. The pleura do have sensory innervation.\nOption (B): \u201cThe pleura are separated by a 2 mm space.\u201d This information is not correct. There is a very thin \u201cpotential\u201d space between the layers of the pleura; however, it is typically filled with serous pleural fluid. \nOption (C): \u201cThe pleura extend into the neck.\u201d This information is actually true. The cervical pleura, also known as the dome of the pleura, lines the extension of the pleural cavity into the neck.\nOption (D): \u201cThe pleura are composed of respiratory epithelium.\u201d This information is not correct. The pleura are composed of connective tissue.\nBecause (A), (B), and (D) are all incorrect, (C) is the only correct answer. The answer is (C).\n\nQ: What is the embryological origin of the hyoid bone?\n(A) The first pharyngeal arch (B) The first and second pharyngeal arches (C) The second pharyngeal arch (D) The second and third pharyngeal arches\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. The hyoid bone is a small U-shaped bone located in the anterior neck. In its resting position, it lies between the base of the mandible and the third cervical vertebra. We know that the second and the third pharyngeal arches give rise to the horns of the hyoid bone; therefore, the embryological origin of the hyoid bone is the second and the third pharyngeal arches\u2014this information is covered in the last option (D). Therefore, we conclude that (D) must be the correct answer. The answer is (D).", "astronomy": "The following are multiple choice questions (with answers) about astronomy.\n\nQ: Where do most short-period comets come from and how do we know?\n(A) The Kuiper belt; short period comets tend to be in the plane of the solar system just like the Kuiper belt. (B) The Kuiper belt; short period comets tend to come from random directions indicating a spherical distribution of comets called the Kuiper belt. (C) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt. (D) The Oort cloud; short period comets tend to be in the plane of the solar system just like the Oort cloud.\nA: Let's think step by step. Most short-period comets come from the Kuiper belt, and we know because short period comets tend to be in the plane of the solar system, just like the Kuiper belt is. The answer is (A).\n\nQ: You are pushing a truck along a road. 
Would it be easier to accelerate this truck on Mars? Why? (Assume there is no friction)\n(A) It would be harder since the truck is heavier on Mars. (B) It would be easier since the truck is lighter on Mars. (C) It would be harder since the truck is lighter on Mars. (D) It would be the same no matter where you are.\nA: Let's think step by step. If we assume that there is no friction, the force needed to accelerate the truck is by Newton\u2019s second law only dependent on the mass of the truck. Hence (A), (B) and (C) are incorrect since it doesn\u2019t matter that it\u2019s on Mars, and (D) is the correct answer. The answer is (D).\n\nQ: Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?\n(A) 10000 times more (B) 100 times more (C) 1000 times more (D) 10 times more\nA: Let's think step by step. The amount of light is proportional to the aperture area $A = \\pi D^2/4$ for a lens with diameter $D$, so the relative amounts of light between the eye with diameter 5 mm and the telescope with diameter 50 cm is $(50 cm)^2/(5 mm)^2 = 10000$. The answer is (A).\n\nQ: Why isn't there a planet where the asteroid belt is located?\n(A) A planet once formed here but it was broken apart by a catastrophic collision. (B) There was not enough material in this part of the solar nebula to form a planet. (C) There was too much rocky material to form a terrestrial planet but not enough gaseous material to form a jovian planet. (D) Resonance with Jupiter prevented material from collecting together to form a planet.\nA: Let's think step by step. The asteroid belt is a circumstellar disc consisting of a large number of asteroids between Mars and Jupiter's orbits. The asteroids in this belt are affected by the gravitational pull from both other asteroids and nearby planets. Due to the strong gravitational force of Jupiter there are resonances that give rise to low density regions of asteroids known as Kirkwood gaps. So (B) and (C) are not correct since it\u2019s not a lack of material that prevents a planet from being formed, and (A) is incorrect because these resonances would have prevented a planet from forming in the first place, and (D) is the correct option. The answer is (D).\n\nQ: Why is Mars red?\n(A) Because the surface is covered with heavily oxidized (\"rusted\") minerals. (B) Because the atmosphere scatters more light at bluer wavelengths transmitting mostly red light. (C) Because Mars is covered with ancient lava flows which are red in color. (D) Because flowing water on Mars's surface altered the surface minerals several billion years ago.\nA: Let's think step by step. Option (B) is not correct because if the red color was caused by the scattering off the atmosphere, then the earth with a much thicker atmosphere would also look red. Options (C) and (D) are not specific enough about why the color of the surface would be red, while (A) is correct because it explains that the surface is red due to the rusted materials on the surface and the red color comes from the rust. So the correct option is (A). The answer is (A).", "business_ethics": "The following are multiple choice questions (with answers) about business ethics.\n\nQ: In contrast to _______, _______ aim to reward favourable behaviour by companies. 
The success of such campaigns have been heightened through the use of ___________, which allow campaigns to facilitate the company in achieving _________ .\n(A) Buycotts, Boycotts, Blockchain technology, Charitable donations (B) Buycotts, Boycotts, Digital technology, Increased Sales (C) Boycotts, Buyalls, Blockchain technology, Charitable donations (D) Boycotts, Buycotts, Digital technology, Increased Sales\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cIn contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies. The success of such campaigns have been heightened through the use of *digital technology*, which allow campaigns to facilitate the company in achieving *increased sales*.\u201d The answer is (D).\n\nQ: _______ is the direct attempt to formally or informally manage ethical issues or problems, through specific policies, practices and programmes.\n(A) Corporate social responsibility (B) Business ethics management (C) Sustainability (D) Environmental management\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The direct attempt to manage ethical issues through specific policies, practices, and programs is business ethics management. The answer is (B).\n\nQ: Three contrasting tactics that CSO's can engage in to meet their aims are ________ which typically involves research and communication, ________, which may involve physically attacking a company's operations or ________, often involving some form of _______.\n(A) Non-violent direct action, Violent direct action, Indirect action, Boycott (B) Indirect action, Instrumental action, Non-violent direct action, Information campaign (C) Indirect action, Violent direct action, Non-violent direct-action Boycott (D) Non-violent direct action, Instrumental action, Indirect action, Information campaign\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cThree contrasting tactics that CSO's can engage in to meet their aims are *indirect action*, which typically involves research and communication, *violent direct action*, which may involve physically attacking a company's operations or *non-violent direct action*, often involving some form of *boycott*.\u201d The answer is (C).\n\nQ: To ensure the independence of the non-executive board members, there are a number of steps which can be taken, which include non-executives being drawn from _______ the company, being appointed for a _________ time period as well as being appointed _________.\n(A) Outside, Limited, Independently (B) Inside, Limited, Intermittently (C) Outside, Unlimited, Intermittently (D) Inside, Unlimited, Independently\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cTo ensure the independence of the non-executive board members, there are a number of steps which can be taken, which include non-executives being drawn from *outside* the company, being appointed for a *limited* time period as well as being appointed *independently*.\u201d 
The answer is (A).\n\nQ: Beyond the business case for engaging in CSR there are a number of moral arguments relating to: negative _______, the _______that corporations possess and the ________ of business and society.\n(A) Externalities, Power, Independence (B) Publicity, Insubstantial resources, Mutual dependence (C) Publicity, Power, Independence (D) Externalities, Power, Mutual dependence\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cBeyond the business case for engaging in CSR there are a number of moral arguments relating to: negative *externalities*, the *power* that corporations possess and the *mutual dependence* of business and society.\u201d The answer is (D).", "clinical_knowledge": "The following are multiple choice questions (with answers) about clinical knowledge.\n\nQ: Glycolysis is the name given to the pathway involving the conversion of:\n(A) glycogen to glucose-1-phosphate. (B) glycogen or glucose to fructose. (C) glycogen or glucose to pyruvate or lactate. (D) glycogen or glucose to pyruvate or acetyl CoA.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. Glycolysis is the name given to the pathway involving conversion of glycogen or glucose to pyruvate or lactate. The answer is (C).\n\nQ: What is the difference between a male and a female catheter?\n(A) Male and female catheters are different colours. (B) Male catheters are longer than female catheters. (C) Male catheters are bigger than female catheters. (D) Female catheters are longer than male catheters.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. The difference between a male and female catheter is that male catheters tend to be longer than female catheters. The answer is (B).\n\nQ: How many attempts should you make to cannulate a patient before passing the job on to a senior colleague, according to the medical knowledge of 2020?\n(A) 4 (B) 3 (C) 2 (D) 1\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. According to the medical protocol as of 2020, you should make two attempts to cannulate a patient before passing the job on to a more-senior practitioner. The answer is (C).\n\nQ: In the assessment of the hand function which of the following is true?\n(A) Abduction of the thumb is supplied by spinal root T2 (B) Opposition of the thumb by opponens policis is supplied by spinal root T1 (C) Finger adduction is supplied by the median nerve (D) Finger abduction is mediated by the palmar interossei\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. Of all the options, it is only true that the opposition of the thumb by opponens pollicis is supplied by spinal root T1. The answer is (B).\n\nQ: The energy for all forms of muscle contraction is provided by:\n(A) ATP. (B) ADP. (C) phosphocreatine. (D) oxidative phosphorylation.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. The energy for muscular contraction is provided by ATP (adenosine triphosphate), the cell's immediate source of energy. The answer is (A).", "college_biology": "The following are multiple choice questions (with answers) about college biology.\n\nQ: Which of the following represents an accurate statement concerning arthropods?\n(A) They possess an exoskeleton composed primarily of peptidoglycan. 
(B) They possess an open circulatory system with a dorsal heart. (C) They are members of a biologically unsuccessful phylum incapable of exploiting diverse habitats and nutrition sources. (D) They lack paired, jointed appendages.\nA: Let's think step by step. Peptidoglycan is known to comprise the cell wall of most bacteria, rather than the exoskeleton of arthropods, which is made of chitin, which rules out (A). The answer (C) is false because arthropods are a highly successful phylum. Likewise, arthropods have paired, jointed appendages, which rules out (D). The only remaining option is (B), as arthropods have an open circulatory system with a dorsal tubular heart. The answer is (B).\n\nQ: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?\n(A) 1/400 (B) 19/400 (C) 20/400 (D) 38/400\nA: Let's think step by step. According to the Hardy Weinberg Law, $p^2 + 2 p q + q^2 = 1$, and $p + q = 1$ where $p$ is the frequency of the dominant allele, $q$ is the frequency of the recessive allele, and $p^2$, $q^2$, and $2pq$ are the frequencies of dominant homozygous, recessive homozygous, and heterozygous individuals, respectively. The frequency of the recessive allele (q) is $\\sqrt{\\frac{1}{400}} = 0.05$. We have $p = 1 - q = 0.95$. The frequency of heterozygous individuals is $2pq = 2 \\cdot 0.05 \\cdot 0.95 = 0.095$. The number of heterozygous individuals is equal to the frequency of heterozygous individuals times the size of the population, or $0.095 * 400 = 38$. So we end up with 38/400. The answer is (D).\n\nQ: According to the pressure-flow model of movement of phloem contents, photosynthate movement from source to sink is driven by\n(A) an ATP-dependent pressure-flow pump (B) a water-pressure potential gradient (C) transpiration (D) apoplastic diffusion\nA: Let's think step by step. It is a gradient in water pressure that induces the movement of phloem content, which refers to answer (B). The mechanism of movement does not rely on metabolism, which rules out (A). Transpiration refers to the exhalation of water vapor through plant stomata, and is also not related, which rules out (C). While the apoplastic pathway is one of two main pathways for water transport in plants, it is not central to the pressure flow model, which rules out (D). The answer is (B).\n\nQ: Which of the following contain DNA sequences required for the segregation of chromosomes in mitosis and meiosis?\n(A) Telomeres (B) Centromeres (C) Nucleosomes (D) Spliceosomes\nA: Let's think step by step. Telomeres cap the ends of chromosomes and are not responsible for their segregation, which rules out (A). Nucleosomes are the repeating subunit that comprises chromatin packed in a cell nucleus, and do not specifically refer to DNA sequences necessary for segregating chromosomes in cell division, which rules out (C). A spliceosome is a large ribonucleoprotein that removes introns from transcribed pre-mRNA rather than governing chromosome segregation. Centromeres are directly responsible for segregating chromosomes in cell division. 
The answer is (B).\n\nQ: The presence of homologous structures in two different organisms, such as the humerus in the front limb of a human and a bird, indicates that\n(A) the human and bird are polyphyletic species (B) a human's and bird's evolution is convergent (C) the human and bird belong to a clade (D) the human and bird developed by analogy\nA: Let's think step by step. Polyphyletic species are organisms that are grouped due to having similar characteristics but which do not have a common ancestor. This is not the case for humans and birds, which rules out (A). Convergent evolution refers to the independent development of similar features in different species at different periods, which is also not the case for humans and birds, which rules out (B). Analogy refers to the superficial resemblance of structures that have different origins, which is not the case for the human and bird forearms, which rules out (D). Humans and birds do belong to the same clade - a group of organisms descended from a common ancestor. The answer is (C).", "college_chemistry": "The following are multiple choice questions (with answers) about college chemistry.\n\nQ: 3 Cl\u2212(aq) + 4 CrO_4^2\u2212(aq) + 23 H+(aq) \u2192 3 HClO2(aq) + 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl\u2212(aq) behaves as\n(A) an acid (B) a base (C) a catalyst (D) a reducing agent\nA: Let's think step by step. A molecule that behaves as a base accepts an H+ ion (or proton) from another molecule, whereas a molecule that behaves as an acid donates an H+ ion (or proton) to another molecule. Neither of these is the case for Cl in this reaction, which rules out (A) and (B). A catalyst is a substance that only accelerates a reaction without itself undergoing chemical change, which is not the case here. This rules out (C). Instead, the $Cl^{-}$ ions carry a negative charge, which they donate in the reaction to form 3 HClO2. This is the behavior of a reducing agent, or (D). The answer is (D).\n\nQ: Which of the following statements about the lanthanide elements is NOT true?\n(A) The most common oxidation state for the lanthanide elements is +3. (B) Lanthanide complexes often have high coordination numbers (> 6). (C) All of the lanthanide elements react with aqueous acid to liberate hydrogen. (D) The atomic radii of the lanthanide elements increase across the period from La to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide elements in fact decrease across the period from La to Lu. Options (A), (B), and (C) are all true. This means that only (D) is NOT true. The answer is (D).\n\nQ: Which of the following lists the hydrides of group-14 elements in order of thermal stability, from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4 < SnH4 < CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4 < GeH4 < SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14 hydrides decreases as we move from the top of group 14 to the bottom. The order of elements in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in order of increasing thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4, or answer (A). The answer is (A).\n\nQ: Predict the number of lines in the EPR spectrum of a solution of 13C-labelled methyl radical (13CH3\u2022), assuming the lines do not overlap.\n(A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step by step. The electron paramagnetic resonance spectrum will be split by two forms of interactions. 
The first is the hyperfine interaction with the 13C (nuclear spin $I = \\frac{1}{2}$) which will split the spectrum into 2 lines. This will be further split into 4 lines by the interaction with three equivalent 1H nuclei. The total number of lines is therefore $2 \\cdot 4 = 8$. The answer is (E).", "college_computer_science": "The following are multiple choice questions (with answers) about college computer science.\n\nQ: Which of the following regular expressions is equivalent to (describes the same set of strings as) (a* + b)*(c + d)?\n(A) a*(c + d)+ b(c + d)\n(B) a*(c + d)* + b(c + d)*\n(C) a*(c + d)+ b*(c + d)\n(D) (a + b)*c +(a + b)*d\nA: Let's think step by step. We know that:\n1. (X* + Y)* = (X + Y)*\n2. X(Y + Z) = XY + XZ\nUsing equation 1 we can rewrite (a* + b)*(c + d) as:\n3. (a + b)*(c + d)\nUsing equation 2 we can rewrite equation 3 as:\n(a + b)*c + (a + b)*d The answer is (D).\n\nQ: The Singleton design pattern is used to guarantee that only a single instance of a class may be instantiated. Which of the following is (are) true of this design pattern?\nI. The Singleton class has a static factory method to provide its instance.\nII. The Singleton class can be a subclass of another class.\nIII. The Singleton class has a private constructor.\n(A) I only\n(B) II only\n(C) III only\n(D) I, II, and III\nA: Let's think step by step. Statement I is a correct statement about a Singleton, because a Singleton provides its single instance through a static factory method. Statement II is also correct, because there is no inherent restriction regarding the inheritance of a Singleton. Statement III is also correct, because a Singleton must be instantiated only once, so its constructor is made private to prevent any construction except via its static factory method.\nGiven these facts, statements I, II, and III are all correct. The answer is (D).\n\nQ: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, . . . , R7 and supports the following operations:\nADD Rs1, Rs2, Rd (Add Rs1 to Rs2 and put the sum in Rd)\nMUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the product in Rd)\nAn operation normally takes one cycle; however, an operation takes two cycles if it produces a result required by the immediately following operation in an operation sequence.\nConsider the expression AB + ABC + BC, where variables A, B, C are located in registers R0, R1, R2. If the contents of these three registers must not be modified, what is the minimum number of clock cycles required for an operation sequence that computes the value of AB + ABC + BC?\n(A) 5 (B) 6 (C) 7 (D) 8\nA: Let's think step by step. First, we are given that A is in R0, B is in R1, and C is in R2.\nNext, we can see that we must compute three multiplies (AB, BC, and ABC) and two adds (AB + ABC, (AB + ABC) + BC) to compute our final answer, resulting in a minimum of five clock cycles.\nNext, we can see that there is no way to avoid at least one pipeline stall when computing our final answer, because to compute our final sum we must wait at least one cycle for the results from the previous stage to be ready. Thus, our minimum number of cycles must be 6.\nWe can verify that we can create a solution that requires only six cycles as follows:\ncompute AB: MUL R0, R1, R3\ncompute BC: MUL R1, R2, R4\ncompute ABC: MUL R3, R4, R5\ncompute AB + BC: ADD R3, R4, R6\nSTALL\ncompute AB + ABC + BC: ADD R5, R6, R7\nSo there are 6 cycles. 
The answer is (B).\n\nQ: A compiler generates code for the following assignment statement.\nG := (A + B) * C - (D + E) * F\nThe target machine has a single accumulator and a single-address instruction set consisting of instructions load, store, add, subtract, and multiply. For the arithmetic operations, the left operand is taken from the accumulator and the result appears in the accumulator. The smallest possible number of instructions in the resulting code is\n(A) 5 (B) 6 (C) 7 (D) 9\nA: Let's think step by step. We can compute the final answer with the following sequence of operations:\n1. LOAD D (accumulator = D)\n2. ADD E (accumulator = D+E)\n3. MUL F (accumulator = (D+E)*F)\n4. STORE X (X = (D+E)*F)\n5. LOAD A (accumulator = A)\n6. ADD B (accumulator = A+B)\n7. MUL C (accumulator = (A+B)*C)\n8. SUB X (accumulator = (A+B)*C - (D+E)*F)\n9. STORE G (G = (A+B)*C - (D+E)*F)\nThis sequence takes 9 instructions. The answer is (D).\n\nQ: Consider a computer design in which multiple processors, each with a private cache memory, share global memory using a single bus. This bus is the critical system resource. Each processor can execute one instruction every 500 nanoseconds as long as memory references are satisfied by its local cache. When a cache miss occurs, the processor is delayed for an additional 2,000 nanoseconds. During half of this additional delay, the bus is dedicated to serving the cache miss. During the other half, the processor cannot continue, but the bus is free to service requests from other processors. On average, each instruction requires 2 memory references. On average, cache misses occur on 1 percent of references. What proportion of the capacity of the bus would a single processor consume, ignoring delays due to competition from other processors?\n(A) 1/50 (B) 1/27 (C) 1/25 (D) 2/27\nA: Let's think step by step. We know that each instruction requires two memory references, and that there is an average cache miss rate of one percent.\nThus a given processor has:\n(1 cache miss / 100 references) * (2 references / instruction) =\n(2 cache misses / 100 instructions), so:\nmisses_per_instruction = 1 cache miss / 50 instructions.\nNext, we know that each instruction requires 500 nanoseconds when there is no cache miss, and 500 + 2000 = 2500 nanoseconds when there is a cache miss. Thus:\n50 instructions / ((49 * 500) + (1 * 2500)) nanoseconds, so:\ninstructions_per_ns = 50 instructions / 27000 nanoseconds.\nNow, we know that each cache miss locks the bus for half of the 2000 nanosecond cache miss delay, or 1000 nanoseconds, so:\nlock_ns_per_miss = 1000 nanoseconds / cache miss.\nThus we can see that on average a single processor will lock the bus for:\nlock_ns_per_miss * misses_per_instruction * instructions_per_ns =\n(1000 nanoseconds / cache miss) * (1 cache miss / 50 instructions) * (50 instructions / 27000 nanoseconds) = 1000 * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer is (B).", "college_mathematics": "The following are multiple choice questions (with answers) about college mathematics.\n\nQ: Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\n(A) ST = 0 (B) ST = T (C) ST = TS (D) ST - TS is the identity map of V onto itself.\nA: Let's think step by step. 
For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))\u2019 = p(x) + xp\u2019(x)\\]\nand\n\\[TS(p) = xp\u2019(x).\\]\nHence \\[ST(p) - TS(p) = p(x) + xp\u2019(x) - xp\u2019(x) = p(x),\\] so ST - TS is the identity map. The answer is (D).\n\nQ: Suppose that f(1 + x) = f(x) for all real x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11 (B) 0 (C) 11 (D) 33/2\nA: Let's think step by step. The only polynomial such that $f(1 + x) = f(x)$ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. The answer is (C).\n\nQ: Let A be a real 2x2 matrix. Which of the following statements must be true?\nI. All of the entries of A^2 are nonnegative.\nII. The determinant of A^2 is nonnegative.\nIII. If A has two distinct eigenvalues, then A^2 has two distinct eigenvalues.\n(A) I only (B) II only (C) III only (D) II and III only\nA: Let's think step by step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence II holds.\nIII is false: as a counterexample take a diagonal matrix with -1 and 1 on the diagonal. Then $A^2$ is the identity matrix. The answer is (B).\n\nQ: Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) \\in A}?\n(A) -5 (B) -4 (C) -3 (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m and one of the solutions is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need to look for smaller $m$ in order to make $m + n$ negative. The next solution is $m = -14$ and $n = 10$. For smaller $m$ we have $m + n$ smaller than $-4$. The answer is (B).\n\nQ: A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\n(A) 2 (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step by step. For all $t \\in \\mathbb{R}$, let $s(t)$ denote the number of grams of salt in the tank at the $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably. We also use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. The solution sprayed into the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always 100 liters of liquid in the tank, containing $s$ grams of salt. So the density of salt in the tank is $s / 100$ grams per liter. The flow of water out of the tank therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for all $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$, and so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\\right]$. For all $t \\in \\mathbb{R}$,\n$$\n\\frac{d}{d t}[\\ln (s-2)]=\\frac{s^{\\prime}}{s-2}=\\frac{-1}{25}=\\frac{d}{d t}\\left[-\\frac{t}{25}\\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that, for all $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then, for all $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K e^{-t / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100 / 25}=2+1 \\cdot e^{-4}=2+e^{-4}$. The answer is (D).", "college_medicine": "The following are multiple choice questions (with answers) about college medicine.\n\nQ: An expected side effect of creatine supplementation is:\n(A) muscle weakness. (B) gain in body mass. (C) muscle cramps. 
(D) loss of electrolytes.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Creatine supplementation is a dietary supplement that results in body mass gain. The answer is (B).\n\nQ: Which of the following is not a true statement?\n(A) Muscle glycogen is broken down enzymatically to glucose-1-phosphate (B) Elite endurance runners have a high proportion of Type I fibres in their leg muscles (C) Liver glycogen is important in the maintenance of the blood glucose concentration (D) Insulin promotes glucose uptake by all tissues in the body\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s solve this step by step and go over each choice: \n(A) \u201cMuscle glycogen is broken down enzymatically to glucose-1-phosphate\u201d: This is a correct statement.\n(B) \u201cElite endurance runners have a high proportion of Type I fibres in their leg muscles\u201d: This is a correct statement.\n(C) \u201cLiver glycogen is important in the maintenance of the blood glucose concentration\u201d: This is a correct statement. \n(D) \u201cInsulin promotes glucose uptake by all tissues in the body\u201d: This is not a correct statement, because insulin promotes glucose uptake by the liver, adipose tissue, and muscle, but not all tissues. For instance, the tissues in the brain and red blood cells are not affected by insulin. The answer is (D).\n\nQ: A high school science teacher fills a 1 liter bottle with pure nitrogen and seals the lid. The pressure is 1.70 atm, and the room temperature is 25\u00b0C. Which two variables will both increase the pressure of the system, if all other variables are held constant?\n(A) Increasing temperature, increasing moles of gas (B) Increasing temperature, increasing volume (C) Decreasing volume, decreasing temperature (D) Decreasing moles of gas, increasing volume\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The relevant equation for this is the ideal gas law: PV=nRT. To increase the pressure of the system (P), then either n (number of moles of the gas) or T (temperature) have to increase. The answer is (A).\n\nQ: In a genetic test of a newborn, a rare genetic disorder is found that has X-linked recessive transmission. Which of the following statements is likely true regarding the pedigree of this disorder?\n(A) All descendants on the maternal side will have the disorder. (B) Females will be approximately twice as affected as males in this family. (C) All daughters of an affected male will be affected. (D) There will be equal distribution of males and females affected.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s solve this step by step. Let's recall first that females have two X chromosomes, while males have one X and one Y chromosome. This is an important fact we need to know before answering this question. \nBecause a male can only pass his only one X chromosome to a daughter, if he is affected by this rare genetic disorder, then we know for sure that he will pass this rare genetic disorder to all his future-born daughters. Therefore, \u201c(C): All daughters of an affected male will be affected\u201d is a correct statement. The answer is (C).\n\nQ: Glucose is transported into the muscle cell:\n(A) via protein transporters called GLUT4. (B) only in the presence of insulin. (C) via hexokinase. (D) via monocarbylic acid transporters.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. 
Glucose (also known as blood sugar) is the main sugar found in the human body. It is transported into the muscle cell via facilitated diffusion through protein transporters called GLUT4. The answer is (A).", "college_physics": "The following are multiple choice questions (with answers) about college physics.\n\nQ: A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4. The answer is (A).\n\nQ: The muon decays with a characteristic lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from decaying into an electron and just a single neutrino by the law of conservation of\n(A) charge (B) mass (C) energy and momentum (D) lepton number\nA: Let's think step by step. Lepton number must be conserved, meaning the total number of leptons minus the number of antileptons. If a muon decays into an electron and a single neutrino, the total lepton number would go from one to two, violating lepton number conservation. The answer is (D).\n\nQ: One end of a Nichrome wire of length 2L and cross-sectional area A is attached to an end of another Nichrome wire of length L and cross- sectional area 2A. If the free end of the longer wire is at an electric potential of 8.0 volts, and the free end of the shorter wire is at an electric potential of 1.0 volt, the potential at the junction of the two wires is most nearly equal to\n(A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V\nA: Let's think step by step. This is a simple voltage divider problem, where the longer wire has a resistance four times that of the shorter wire. So the voltage divider ratio is 1 / 5, meaning that the potential in the middle is 1.0 V + (8.0 V - 1.0 V) * 1/5 = 2.4 V. The answer is (A).\n\nQ: A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4. The answer is (A).\n\nQ: For which of the following thermodynamic processes is the increase in the internal energy of an ideal gas equal to the heat added to the gas?\n(A) Constant temperature (B) Constant volume (C) Constant pressure (D) Adiabatic\nA: Let's think step by step. Heat added to the gas can go into the gas's internal energy or into work done against an external force. However, if the volume of the gas container is constant, no work will be done (since work is pressure times change in volume). So, at constant volume, all of the heat goes into the internal energy. 
The answer is (B).", "computer_security": "The following are multiple choice questions (with answers) about computer security.\n\nQ: SHA-1 has a message digest of\n(A) 160 bits (B) 512 bits (C) 628 bits (D) 820 bits\nA: Let's think step by step. Since SHA-1 is a hash function which takes an input and produces a 160-bit (20-byte) hash value, its message digest is 160 bits. The answer is (A).\n\nQ: _____________ can modify data on your system \u2013 so that your system doesn\u2019t run correctly or you can no longer access specific data, or it may even ask for ransom in order to give your access.\n(A) IM \u2013 Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D) Ransom Trojan\nA: Let's think step by step. The system is asking for trojans, which are for ransom, which means ransom trojan. The answer is (D).\n\nQ: What is ethical hacking?\n(A) \"Hacking\" ethics so they justify unintended selfish behavior (B) Hacking systems (e.g., during penetration testing) to expose vulnerabilities so they can be fixed, rather than exploited (C) Hacking into systems run by those whose ethics you disagree with (D) A slang term for rapid software development, e.g., as part of hackathons\nA: Let's think step by step. Ethical hacking is a process of detecting vulnerabilities in an application, system, or organization's infrastructure that an attacker can use to exploit an individual or organization. They use this process to prevent cyberattacks and security breaches by lawfully hacking into the systems and looking for weak points. The answer is (B).\n\nQ: The ____________ is anything which your search engine cannot search.\n(A) Haunted web (B) World Wide Web (C) Surface web (D) Deep Web\nA: Let's think step by step. The search engine searches on the Surface Web, which is the portion of the world wide web which is visible so (B,C) are wrong. The Haunted Web doesn\u2019t correspond to an internet concept. The Deep Web is the part of the World Wide Web which is not indexed. The answer is (D).\n\nQ: Exploitation of the Heartbleed bug permits\n(A) overwriting cryptographic keys in memory (B) a kind of code injection (C) a read outside bounds of a buffer (D) a format string attack\nA: Let's think step by step. The Heartbleed Bug is a serious vulnerability in the popular OpenSSL cryptographic software library. Heartbleed resulted from improper input validation (due to a missing bounds check) in the implementation of the TLS heartbeat extension. The vulnerability was classified as a buffer over-read, a situation where more data can be read than should be allowed. The answer is (C).", "conceptual_physics": "\nThe following are multiple choice questions (with answers) about conceptual physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted to a different frequency (B) deflection (C) interference (D) polarization\nA: Let's think step by step. In a soap bubble film, the light bounces between the two soap-air interfaces many times, interfering with itself constructively or destructively depending on the width of the film. This results in different colors being visible. The answer is (C).\n\nQ: Compared with the mass of a uranium atom undergoing fission, the combined masses of the products after fission are\n(A) less (B) more (C) the same (D) zero\nA: Let's think step by step. Fission releases energy, which comes from the rest mass of its initial nucleus. Thus the mass of the products is less than the mass of the reactant uranium nucleus. 
The answer is (A).\n\nQ: Things that are equivalent according to the equivalence principle are\n(A) space and time. (B) a traveling twin and a stay-at-home twin. (C) gravity and acceleration. (D) mass and energy.\nA: Let's think step by step. Einstein\u2019s famous equivalence principle states that gravity and acceleration are equivalent. The answer is (C).\n\nQ: Which of these three elements has the most mass per nucleon?\n(A) Hydrogen (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step. Due to nuclear binding energy, the mass of an atomic nucleus is less than the sum of individual masses of the free constituent protons and neutrons; this is known as the mass defect. Hydrogen has no mass defect because it has only a single nucleon, so it will have the most mass per nucleon. The answer is (A).\n\nQ: A model airplane flies slower when flying into the wind and faster with wind at its back. When launched at right angles to the wind, a cross wind, its groundspeed compared with flying in still air is\n(A) the same (B) greater (C) less (D) either greater or less depending on wind speed\nA: Let's think step by step. The plane\u2019s speed in the direction of the wind is greater than it would be in the absence of wind, and its speed orthogonal to the wind is the same as it would be in the absence of the wind. The total speed, which is these two components added in quadrature, is thus greater than the speed in still air. The answer is (B).", "econometrics": "The following are multiple choice questions (with answers) about econometrics.\n\nQ: Suppose now that a researcher wishes to use information criteria to determine the optimal lag length for a VAR. 500 observations are available for the bi-variate VAR, and the values of the determinant of the variance-covariance matrix of residuals are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively. What is the optimal model order according to Akaike's information criterion?\n(A) 1 lag (B) 2 lags (C) 3 lags (D) 4 lags\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s solve this problem step by step. First of all, let\u2019s recall that for a given set of data, Akaike's information criterion (AIC) allows us to measure how well a statistical model fits the data; it is an estimator of prediction error. Here in this problem we will need to use the formula ln(det(sigma_hat)) + (2 * k / T) to determine the values of Akaike\u2019s criterion, where ln denotes the natural log function, det the determinant function, k the total number of parameters (across both equations), and T the number of observations (which, in this case, is equal to 500). For 1 lag, the number of parameters in total is equal to 6; for 2 lags, it is 10; for 3 lags, it is 14; and for 4 lags, it is 18. Now, let\u2019s calculate the values of the criterion for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500) = ln(0.0336) + (12 / 500) = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) = ln(0.0169) + (20 / 500) = -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084) + (28 / 500) = -4.724\n(D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36 / 500) = -5.011\nBecause the optimal model order according to AIC minimizes the information criterion, the answer should be the one with the lowest value. In this case, (D) has the lowest value. 
The answer is (D).\n\nQ: Consider the following AR(1) model with the disturbances having zero mean and unit variance\nyt = 0.2 + 0.4 yt-1 + ut\nThe (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4 (C) 0.5 (D) 0.33\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s solve this problem step by step. If we have an AR(1) model with the disturbances having zero mean and unit variance, then the unconditional mean of y is equal to the following:\nunconditional mean of y = (the intercept term) / (1 - autoregressive coefficient)\nWe know that the intercept term is 0.2 and the autoregressive coefficient is 0.4; thus, we have:\nunconditional mean of y = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which is approximately 0.33. That means that the answer should be (D) 0.33. The answer is (D).\n\nQ: What would be the consequences for the OLS estimator if heteroscedasticity is present in a regression model but ignored?\n(A) It will be biased (B) It will be inconsistent (C) It will be inefficient (D) All of (a), (b) and (c) will be true.\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Heteroscedasticity refers to the condition where the variance of the error terms is not constant across multiple observations. If heteroscedasticity is present in a regression model, then the OLS coefficient estimates will still be unbiased and consistent, but they will be inefficient. Because (A) and (B) are incorrect choices and (C) is a correct choice, (D) cannot be the right answer. Ultimately, (C) is the only true choice. The answer is (C).\n\nQ: Suppose that a test statistic has associated with it a p-value of 0.08. Which one of the following statements is true?\n(i) If the size of the test were exactly 8%, we would be indifferent between rejecting and not rejecting the null hypothesis\n(ii) The null would be rejected if a 10% size of test were used\n(iii) The null would not be rejected if a 1% size of test were used\n(iv) The null would be rejected if a 5% size of test were used.\n(A) (ii) and (iv) only (B) (i) and (iii) only (C) (i), (ii), and (iii) only (D) (i), (ii), (iii), and (iv).\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s reason about each of the options.\n(i) is a true statement.\n(ii) is a true statement.\n(iii) is a true statement.\n(iv) is not a true statement. Thus, (i), (ii), and (iii) are true. The answer is (C).\n\nQ: For a stationary autoregressive process, shocks will\n(A) Eventually die away (B) Persist indefinitely (C) Grow exponentially (D) Never occur\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. This is a question about stationary processes. For a stationary autoregressive process, shocks will eventually die away. The answer is (A).", "electrical_engineering": "The following are multiple choice questions (with answers) about electrical engineering.\n\nQ: A point pole has a strength of 4\u03c0 * 10^-4 weber. The force in newtons on a point pole of 4\u03c0 * 1.5 * 10^-4 weber placed at a distance of 10 cm from it will be\n(A) 15 N. (B) 20 N. (C) 7.5 N. (D) 3.75 N.\nA: Let's think step by step. The force between two point poles is given by m_1m_2/(mu_0 4 \\pi r^2), in analogy to Coulomb\u2019s law. Plugging in the values given in the question, we calculate that the force is approximately 15 N. 
The answer is (A).\n\nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30 mm wide. The control torque is 240*10-6 N-m on full scale. If magnetic flux density is 1Wb/m2 range of meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's think step by step. The torque on a coil in a uniform magnetic field is given by BANI, where B is the magnetic flux density, A is the area of the coil, N is the number of turns, and I is the current. So we have that I = (Torque)/(BAN), or 240e-6/(1200e-6 * 100 * 1) = 2e-3. The answer is (B).\n\nQ: In an SR latch built from NOR gates, which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1 (C) S=1, R=0 (D) S=1, R=1\nA: Let's think step by step. An SR latch is a set-reset latch; in the case where S=1 and R=1, the circuit has no stable state; instead a race condition will be produced within the circuit, so the device will be in an undefined state. So S=1, R=1 is an illegal input. The answer is (D).\n\nQ: Two long parallel conductors carry 100 A. If the conductors are separated by 20 mm, the force per meter of length of each conductor will be\n(A) 100 N. (B) 0.1 N. (C) 1 N. (D) 0.01 N.\nA: Let's think step by step. The magnetic force-per-length between two current-carrying conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r), where $r$ is the separation distance and I_1 and I_2 are the currents. Plugging in 100 A for I_1 and I_2, and 20 mm for r, gives 0.1 N. The answer is (B).\n\nQ: In a 2 pole lap winding dc machine , the resistance of one conductor is 2\u03a9 and total number of conductors is 100. Find the total resistance\n(A) 200\u03a9 (B) 100\u03a9 (C) 50\u03a9 (D) 10\u03a9\nA: Let's think step by step. In lap winding, effectively two resistors are connected in parallel, so the actual resistance of each pair is 1 Ohm. Since we have 50 pairs, we get a total resistance of 50 Ohms. The answer is (C).", "elementary_mathematics": "The following are multiple choice questions (with answers) about elementary mathematics.\n\nQ: Olivia used the rule \"Add 11\" to create the number pattern shown below. 10, 21, 32, 43, 54. Which statement about the number pattern is true?\n(A) The 10th number in the pattern will be an even number.\n(B) The number pattern will never have two even numbers next to each other.\n(C) The next two numbers in the pattern will be an even number then an odd number.\n(D) If the number pattern started with an odd number then the pattern would have only odd numbers in it.\nA: Let's think step by step. Choice A is incorrect because every even-numbered term in the pattern is odd, and 10 is an even number. Choice B is correct, because adding an odd number (in this case 11) to an odd number produces an even number, and adding an odd number to an even number produces an odd number. Thus the terms in the pattern will alternate between odd and even, so there will never be two even numbers next to each other. Choice C is incorrect because the last term in the example is even (54), and we know that the terms will alternate between even and odd. Choice D is incorrect because the terms in the pattern will alternate between odd and even, regardless of the value of the first term. The answer is (B).\n\nQ: The population of the city where Michelle was born is 145,826. What is the value of the 5 in the number 145,826?\n(A) 5 thousands\n(B) 5 hundreds\n(C) 5 tens\n(D) 5 ones\nA: Let's think step by step. Choice A is correct, because there are three digits following the 5, so\nthe 5 is in the thousands place. 
Thus the other choices are incorrect. The answer is (A).\n\nQ: A store sells 107 different colors of paint. They have 25 cans of each color in storage. The number of cans of paint the store has in storage can be found using the expression below. 107 \u00d7 25. How many cans of paint does the store have in storage?\n(A) 749\n(B) 2,675\n(C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107 x 25 = (100 x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\n(A) Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n(C) Multiply 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\nA: Let's think step by step. We want to find the number of teams. We know that there are 5 players/team, and 30 players. Thus to get the number of teams we divide players by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B).\n\nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n(B) (5 x 5) + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think step by step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5 x 4). The answer is (B).", "formal_logic": "The following are multiple choice questions (with answers) about formal logic.\n\nQ: Which of the given formulas of PL is the best symbolization of the following sentence?\nTurtles live long lives and are happy creatures, unless they are injured.\n(A) (L \u2022 H) \u2261 I (B) (L \u2022 H) \u2228 I (C) L \u2022 (H \u2228 I) (D) L \u2022 (H \u2283 R).\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cL\u201d denote \u201cliving long\u201d, H \u201cbeing happy\u201d, and \u201cI\u201d \u201cbeing injured\u201d. Now, consider each choice:\n(A) means (living long AND being happy) is equivalent to (being injured). \n(B) means (living long AND being happy) OR (being injured). \n(C) means (living long) AND (being happy OR being injured). \n(D) means (living long) AND (being happy implies being R), but what R denotes is not clear.\nObviously, (B) is the best symbolization of the original sentence. The answer is (B).\n\nQ: Select the best translation into predicate logic.George borrows Hector's lawnmower. (g: George; h: Hector; l: Hector's lawnmower; Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh (D) Bghl\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. We are told that \u201cBxyx\u201d means \u201cx borrows y from z\u201d. We can rewrite \u201cGeorge borrows Hector's lawnmower\u201d as \u201cGeorge borrows a lawnmower from Hector\u201d, which can then be translated into predicate logic as \u201cBglh\u201d. The answer \u201cBglh\u201d appears in (C); therefore, (C) must be the correct answer. The answer is (C).\n\nQ: \nSelect the best English interpretation of the given arguments in predicate logic.\nDm\n(\u2200x)(Wx \u2283 ~Dx). \n(\u2200x)Wx \u2228 Ag\t/ (\u2203x)Ax\n(A) Marina is a dancer. Some weaklings are not dancers. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball. (B) Marina is a dancer. No weakling is a dancer. Everything is either a weakling or plays volleyball. So something plays volleyball. (C) Marina is a dancer. Some weaklings are not dancers. 
Everything is either a weakling or plays volleyball. So something plays volleyball. (D) Marina is a dancer. No weakling is a dancer. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball.\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cD\u201d denote \u201cbeing a dancer\u201d, \u201cm\u201d denote \u201cMarina\u201d, \u201cg\u201d denote \u201cGeorgia\u201d, \u201cW\u201d denote \u201cweakling\u201d, \u201cA\u201d denote \u201cplaying volleyball\u201d. Then, we have the following:\n1. Dm \u2192 Marina is a dancer.\n2. (\u2200x)(Wx \u2283 ~Dx). \u2192 For all x, if x is a weakling, then x is not a dancer. In other words, no weakling is a dancer.\n3. (\u2200x)Wx \u2228 Ag\t/ (\u2203x)Ax \u2192 For all x, x is a weakling or Georgia plays volleyball. So there exists an x that plays volleyball. \nOptions (A) and (C) do claim that some weaklings are not dancers, but the second argument strongly states that no weakling is a dancer. Thus, we can eliminate them. Option (B) omits the important detail about Georgia playing volleyball. Option (D) has all the details presented in the arguments and is the best English interpretation of the arguments. The answer is (D).\n\nQ: Select the best translation into predicate logic: No people drive on Mars.\n(A) ~Pd (B) (\u2200x)(Px \u2228 ~Dx) (C) (\u2200x)(Px \u2283 ~Dx) (D) ~Dp\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cP\u201d denote \u201cbeing a person\u201d and \u201cD\u201d denote \u201cdriving on Mars\u201d. Then let\u2019s consider each option:\nOption (A): ~Pd \u2192 d is not a person.\nOption (B): (\u2200x)(Px \u2228 ~Dx) \u2192 For all x, x is a person or x does not drive on Mars.\nOption (C): (\u2200x)(Px \u2283 ~Dx) \u2192 For all x, if x is a person then x does not drive on Mars.\nOption (D): ~Dp: \u2192 p does not drive on Mars.\nOf all these options, Option (C) appears to be the best and most meaningful interpretation of the argument \u201cNo people drive on Mars.\u201d The answer is (C).", "global_facts": "The following are multiple choice questions (with answers) about global facts.\n\nQ: As of 2017, how many of the world\u2019s 1-year-old children today have been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. According to data published by the World Health Organization, the number of 1-year-old children vaccinated in 2017 exceeds 80%. The answer is (A).\n\nQ: As of 2019, about what percentage of Americans agree that the state is run for the benefit of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. In 2019, about 46% of Americans agreed that the state is run for the benefit of all the people. The answer is (B).\n\nQ: As of 2019, about what percentage of Russians say it is very important to have free media in our country without government/state censorship?\n(A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2019, about 38% of Russians say it is very important to have free media in our country. 
The answer is (A).\n\nQ: As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.\n(A) increased, increased (B) increased, decreased (C) decreased, increased (D) decreased, decreased\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2015, since 1990 forests have increased in Europe and have decreased in Africa and the Americas. The answer is (B).\n\nQ: Which of the following pairs of statements are both true (as of 2019)?\n(A) People tend to be optimistic about their own future and the future of their nation or the world. (B) People tend to be optimistic about their own future but pessimistic about the future of their nation or the world. (C) People tend to be pessimistic about their own future but optimistic about the future of their nation or the world. (D) People tend to be pessimistic about their own future and the future of their nation or the world.\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2019, most people tend to be optimistic about their own future but pessimistic about the future of their nation or the world. The answer is (B).", "high_school_biology": "The following are multiple choice questions (with answers) about high school biology.\n\nQ: In animal cells, which of the following represents the most likely pathway that a secretory protein takes as it is synthesized in a cell?\n(A) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013rough ER (B) Ribosome\u2013Golgi apparatus\u2013rough ER\u2013secretory vesicle\u2013plasma membrane (C) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013rough ER (D) Ribosome\u2013rough ER\u2013Golgi apparatus\u2013secretory vesicle\u2013plasma membrane\nA: Let's think step by step. Protein synthesis starts at the ribosome, so we can eliminate (A) and (C). The protein is synthesized on ribosomes bound to the rough ER and moves from there to the Golgi apparatus, where it is modified and packaged into a secretory vesicle. The vesicle then travels to the plasma membrane, where the protein is secreted. The answer is (D).\n\nQ: A mutation in a bacterial enzyme changed a previously polar amino acid into a nonpolar amino acid. This amino acid was located at a site distant from the enzyme\u2019s active site. How might this mutation alter the enzyme\u2019s substrate specificity?\n(A) By changing the enzyme\u2019s pH optimum (B) By changing the enzyme\u2019s location in the cell (C) By changing the shape of the protein (D) An amino acid change away from the active site cannot alter the enzyme\u2019s substrate specificity.\nA: Let's think step by step. A change in an amino acid leads to a change in the primary structure of the protein. A change in the primary structure may lead to a change in the secondary and the tertiary structure of the protein. A change in the tertiary structure means a change in the shape of the protein, so (C) has to be correct. Since the change does not affect the active site of the enzyme, we do not expect the activity of the enzyme to be affected. The answer is (C).\n\nQ: Which of the following is not a way to form recombinant DNA?\n(A) Translation (B) Conjugation (C) Specialized transduction (D) Transformation\nA: Let's think step by step. The introduction of foreign DNA or RNA into bacteria or eukaryotic cells is a common technique in molecular biology and scientific research. 
There are multiple ways foreign DNA can be introduced into cells including transformation, transduction, conjugation, and transfection. In contrast, (A) is not a way to form recombinant DNA: during translation the ribosomes synthesize proteins from RNA. The answer is (A).\n\nQ: Homologous structures are often cited as evidence for the process of natural selection. All of the following are examples of homologous structures EXCEPT\n(A) the wings of a bird and the wings of a bat (B) the flippers of a whale and the arms of a man (C) the pectoral fins of a porpoise and the flippers of a seal (D) the forelegs of an insect and the forelimbs of a dog\nA: Let's think step by step. Homologous structures are similar physical features in organisms that share a common ancestor but serve different functions. Comparisons (B) and (C) are clearly homologous because they share a common ancestor and the structures serve different purposes. Bat wings and bird wings are also homologous; while they are both wings, the forelimbs serve different purposes. Insects and dogs are only very distantly related, since one is a vertebrate while the other is an invertebrate, and the forelimbs serve the same purpose, so they are not homologous. The answer is (D).\n\nQ: Which of the following is not known to be involved in the control of cell division?\n(A) Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast cells\nA: Let's think step by step. Normal cells move through the cell cycle in a regulated way. At the checkpoint stage, they use information about their own internal state and cues from the environment around them to decide whether to proceed with cell division. Cues like these act by changing the activity of core cell cycle regulators inside the cell. The most common regulators are cyclins and cyclin-dependent kinases. Fibroblast cells do not play any role in cell division. The answer is (D).", "high_school_chemistry": "The following are multiple choice questions (with answers) about high school chemistry.\n\nQ: Which of the following is considered an acid anhydride?\n(A) HCl (B) H2SO3 (C) SO2 (D) Al(NO3)3\nA: Let's think step by step. An acid anhydride is a compound that is derived by removing water from an acid. The chemical formula for water is H2O, which means that we need to determine which of these options, when combined with H2O, forms an acid. SO2, or sulfur dioxide, when combined with H2O, makes H2SO3, or sulfurous acid. The answer is (C).\n\nQ: Which of the following is expected to be a polar molecule?\n(A) PCl4F (B) BF3 (C) CO2 (D) Si(CH3)4\nA: Let's think step by step. A polar molecule is one that has a slightly positive charge on one end of the molecule and a slightly negative charge on the other end. Boron trifluoride (BF3) has Boron as the center atom and three fluorine atoms attached to it; it is trigonal planar and symmetric, so it is nonpolar. Carbon Dioxide (CO2) has Carbon as the central atom with double bonds to two Oxygen atoms - this is also symmetrical and therefore nonpolar. The same is the case for tetramethyl silane (Si(CH3)4), which is a Silicon atom surrounded by four methyl groups. The structure of PCl4F is that Phosphorus is the central atom, attached to four chlorines and one fluorine atom. This is asymmetrical, and therefore has a net dipole and is expected to be a polar molecule. 
The answer is (A).\n\nQ: From the solubility rules, which of the following is true?\n(A) All chlorides, bromides, and iodides are soluble (B) All sulfates are soluble (C) All hydroxides are soluble (D) All ammonium-containing compounds are soluble\nA: Let's think step by step. The chlorides, bromides, and iodides of lead, silver, and mercury are not soluble in water. This rules out (A). The sulfates of lead, barium, and calcium are not soluble in water, which rules out (B). The hydroxides of any metal besides sodium, potassium, ammonium, calcium, and barium are insoluble. This rules out (C). Typically ammonium ions indicate a soluble ionic substance. The answer is (D).\n\nQ: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?\n(A) 3.89 (B) 7.78 (C) 5.78 (D) 2.33\nA: Let's think step by step. Recall that $[A^{-}] = [H^{+}]$. Here, this is equal to $10^{-3.89}$. Then we have $K_{a} = \\frac{[H^{+}][A^{-}]}{[HA]} = \\frac{10^{-3.89} \\cdot 10^{-3.89}}{10^{-2}}$. The resulting exponent is $-3.89 + (-3.89) - (-2) = -5.78$, therefore $K_a = 10^{-5.78}$. The $pK_a$ is the negative log of $K_a$, which is equal to $5.78$. The answer is (C).\n\nQ: A solution contains 2.00 mole of acetic acid, CH3COOH, and 1.00 mole of calcium acetate, Ca(CH3COO)2. The solution is able to resist the addition of a small amount of strong acid or strong base with only minor changes in the pH of the solution. Larger quantities of strong acid or strong base can cause a significant change in pH. How many moles of nitric acid, HNO3, may be added before the pH begins to change significantly?\n(A) 0.500 mole (B) 1.00 mole (C) 2.00 mole (D) 3.00 mole\nA: Let's think step by step. We would like to compute the buffer capacity of this solution. First we write the equation for the ionization of the weak acid, in this case of acetic acid. $CH_{3}COOH (aq) + H_{2}O \\rightarrow H_{3}O^{+} + CH_{3}COO^{-}$. The conjugate base is therefore the acetate ion. The added strong acid, nitric acid, will react with the conjugate base. Therefore the maximum amount of acid that can be added will be equal to the amount of acetate ion, or 2 moles. The answer is (C).", "high_school_computer_science": "The following are multiple choice questions (with answers) about high school computer science.\n\nQ: Which of the following is an example of the use of a device on the Internet of Things (IoT) ?\n(A) A car alerts a driver that it is about to hit an object. (B) A hiker uses a G P S watch to keep track of her position. (C) A refrigerator orders milk from an online delivery service when the milk in the refrigerator is almost gone. (D) A runner uses a watch with optical sensors to monitor his heart rate.\nA: Let's think step by step. The term Internet of Things (IoT) refers to common devices which are connected to the internet, enabling new functionality. Choice A is incorrect because it does not describe an internet-connected device. In choice B, the watch is only described as having GPS functionality but no internet connectivity. Choice C describes a common device (a refrigerator) which has internet connectivity enabling new functionality (online ordering). Choice D does not mention internet connectivity for the watch, only optical sensors. The answer is (C).\n\nQ: Many Web browsers allow users to open anonymous windows. 
During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?\n(A) The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator. (B) Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer. (C) A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session. (D) A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded.\nA: Let's think step by step. Choice A is incorrect as it only describes network traffic, which an anonymous browser does not change. Choice B is correct as it correctly describes how an anonymous browser will prevent saving data on the user\u2019s computer after the session is ended. Choice C is incorrect because an anonymous browser will not prevent logging in to email or social media accounts. Choice D is incorrect because an anonymous browser in itself performs no virus protection. The answer is (B).\n\nQ: In the program below, the initial value of X is 5 and the initial value of Y is 10.\nIF (X < 0){\n DISPLAY (\"Foxtrot\")\n} ELSE {\n IF (X > Y){\n DISPLAY (\"Hotel\")\n } ELSE {\n IF (Y > 0){\n DISPLAY (\"November\")\n } ELSE {\n DISPLAY (\"Yankee\")\n }\n }\n}\nWhat is displayed as a result of running the program?\n(A) Foxtrot (B) Hotel (C) November (D) Yankee\nA: Let's think step by step. Because X has the value 5, the first conditional IF (X < 0) is false, so we move to the first ELSE clause. Because X is 5 and Y is 10, the second conditional IF (X > Y) is false, so we move to the following ELSE clause. Since Y is 10, the conditional IF (Y > 0) is true, so the command DISPLAY (\"November\") is executed. The answer is (C).\n\nQ: What is the output of \"abc\"[::-1] in Python 3?\n(A) Error (B) abc (C) cba (D) c\nA: Let's think step by step. We know that the slicing operator [::-1] takes all of the elements in the string in reverse order, so we reverse the order of the string \"abc\", resulting in \"cba\". The answer is (C).\n\nQ: A list of numbers has n elements, indexed from 1 to n. The following algorithm is intended to display the number of elements in the list that have a value greater than 100. The algorithm uses the variables count and position. Steps 3 and 4 are missing.\n Step 1: Set count to 0 and position to 1.\n Step 2: If the value of the element at index position is greater than 100, increase the value of count by 1.\n Step 3: (missing step)\n Step 4: (missing step)\n Step 5: Display the value of count.\nWhich of the following could be used to replace steps 3 and 4 so that the algorithm works as intended?\n(A) Step 3: Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until the value of count is greater than 100.\n(B) Step 3: Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until the value of position is greater than n.\n(C) Step 3: Repeat step 2 until the value of count is greater than 100.\n Step 4: Increase the value of position by 1.\n(D) Step 3: Repeat step 2 until the value of position is greater than n.\n Step 4: Increase the value of count by 1.\nA: Let's think step by step. 
Choice A is incorrect, because its Step 4 has an incorrect termination condition, stopping when count is greater than 100. We need to stop after inspecting all elements in the list. Choice B is correct because it correctly increments both count and position, and correctly repeats these steps and terminates when all elements in the list have been inspected. Choice C is incorrect because it incorrectly increments the variable count until its value is greater than 100, regardless of the elements in the list. Choice D is incorrect because its step 3 does not increment the value of position, so it will repeat forever. The answer is (B).", "high_school_european_history": "The following are multiple choice questions (with answers) about high school european history.\n\nQ: This question refers to the following information.\nAlbeit the king's Majesty justly and rightfully is and ought to be the supreme head of the Church of England, and so is recognized by the clergy of this realm in their convocations, yet nevertheless, for corroboration and confirmation thereof, and for increase of virtue in Christ's religion within this realm of England, and to repress and extirpate all errors, heresies, and other enormities and abuses heretofore used in the same, be it enacted, by authority of this present Parliament, that the king, our sovereign lord, his heirs and successors, kings of this realm, shall be taken, accepted, and reputed the only supreme head in earth of the Church of England, called Anglicans Ecclesia; and shall have and enjoy, annexed and united to the imperial crown of this realm, as well the title and style thereof, as all honors, dignities, preeminences, jurisdictions, privileges, authorities, immunities, profits, and commodities to the said dignity of the supreme head of the same Church belonging and appertaining; and that our said sovereign lord, his heirs and successors, kings of this realm, shall have full power and authority from time to time to visit, repress, redress, record, order, correct, restrain, and amend all such errors, heresies, abuses, offenses, contempts, and enormities, whatsoever they be, which by any manner of spiritual authority or jurisdiction ought or may lawfully be reformed, repressed, ordered, redressed, corrected, restrained, or amended, most to the pleasure of Almighty God, the increase of virtue in Christ's religion, and for the conservation of the peace, unity, and tranquility of this realm; any usage, foreign land, foreign authority, prescription, or any other thing or things to the contrary hereof notwithstanding.\nEnglish Parliament, Act of Supremacy, 1534\nFrom the passage, one may infer that the English Parliament wished to argue that the Act of Supremacy would\n(A) give the English king a new position of authority (B) give the position of head of the Church of England to Henry VIII alone and exclude his heirs (C) establish Calvinism as the one true theology in England (D) end various forms of corruption plaguing the Church in England\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The Act of Supremacy states that it grants authority to the king \"to repress and extirpate all errors, heresies, and other enormities and abuses\", referring to the corruption in the Church of England. The answer is (D).\n\nQ: This question refers to the following information.\nRead the following excerpt.\nThe revolutionary seed had penetrated into every country and spread more or less. 
It was greatly developed under the r\u00e9gime of the military despotism of Bonaparte. His conquests displaced a number of laws, institutions, and customs; broke through bonds sacred among all nations, strong enough to resist time itself; which is more than can be said of certain benefits conferred by these innovators.\nThe monarchs will fulfil the duties imposed upon them by Him who, by entrusting them with power, has charged them to watch over the maintenance of justice, and the rights of all, to avoid the paths of error, and tread firmly in the way of truth. Placed beyond the passions which agitate society, it is in days of trial chiefly that they are called upon to despoil realities of their false appearances, and to show themselves as they are, fathers invested with the authority belonging by right to the heads of families, to prove that, in days of mourning, they know how to be just, wise, and therefore strong, and that they will not abandon the people whom they ought to govern to be the sport of factions, to error and its consequences, which must involve the loss of society.\nUnion between the monarchs is the basis of the policy which must now be followed to save society from total ruin. . . .\nLet them not confound concessions made to parties with the good they ought to do for their people, in modifying, according to their recognized needs, such branches of the administration as require it.\nLet them be just, but strong; beneficent, but strict.\nLet them maintain religious principles in all their purity, and not allow the faith to be attacked and morality interpreted according to the social contract or the visions of foolish sectarians.\nLet them suppress Secret Societies; that gangrene of society.\n\u2014Klemens von Metternich, Political Confession of Faith, 1820\nWhich of the following was the greatest cause of the fears expressed by Metternich in the document above?\n(A) The ideas of personal liberty and nationalism conceived during the Enlightenment resulted in radical revolutions that could spread throughout Europe. (B) The conquest of Europe by Napoleon led to the creation of new factions and shifted the European balance of power. (C) The power of monarchs had grown to the point where it needed to be checked by other powers within each nation or domination of civilians would occur. (D) The rising and falling economic cycle of the newly emerging capitalist economy could lead to civilian unrest that must be suppressed.\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The fears of revolution in early 19th century Europe expressed by Klemens von Metternich, a conservative Austrian statesman, were a direct result of the age of Enlightenment, a period of European history where the absolute power of the monarchy was challenged with ideas of individual liberty and nationalism, leading to the French revolution and its effects all over Europe. 
The answer is (A).\n\nQ: This question refers to the following information.\nThe excerpts below are from the Navigation Acts of 1651.\n[A]fter the first day of December, one thousand six hundred fifty and one, and from thence forwards, no goods or commodities whatsoever of the growth, production or manufacture of Asia, Africa or America, or of any part thereof; or of any islands belonging to them, or which are described or laid down in the usual maps or cards of those places, as well of the English plantations as others, shall be imported or brought into this Commonwealth of England, or into Ireland, or any other lands, islands, plantations, or territories to this Commonwealth belonging, or in their possession, in any other ship or ships, vessel or vessels whatsoever, but only in such as do truly and without fraud belong only to the people of this Commonwealth, or the plantations thereof, as the proprietors or right owners thereof; and whereof the master and mariners are also of the people of this Commonwealth, under the penalty of the forfeiture and loss of all the goods that shall be imported contrary to this act, , , ,\n[N]o goods or commodities of the growth, production, or manufacture of Europe, or of any part thereof, shall after the first day of December, one thousand six hundred fifty and one, be imported or brought into this Commonwealth of England, or any other lands or territories to this Commonwealth belonging, or in their possession, in any ship or ships, vessel or vessels whatsoever, but in such as do truly and without fraud belong only to the people of this Commonwealth, and in no other, except only such foreign ships and vessels as do truly and properly belong to the people of that country or place, of which the said goods are the growth, production or manufacture.\nWhich of the following best describes the outcome of the Navigation Acts of 1651?\n(A) They served as a catalyst for the growth of English shipping and overseas trade, but did little to limit the prospects of the Dutch in the seventeenth century. (B) They brought about almost immediate hardships for the Dutch economy as their dominance of overseas trade quickly ended. (C) They were rescinded during the restoration of the Stuarts as they sought normal diplomatic relations with the Dutch so not as to need Parliament's financial support for war. (D) They led to nearly a century of recurrent war between England and the Netherlands, which would not end until after American independence.\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The Navigation Acts of 1651 helped English shipping by restricting the ability of ships from other European countries, especially the Dutch, to transport goods from colonies in Asia and Africa into England. The answer is (A).\n\nQ: This question refers to the following information.\nIn Russia there was nothing going on well, and [Souvarine] was in despair over the news he had received. His old companions were all turning to the politicians; the famous Nihilists who made Europe tremble-sons of village priests, of the lower middle class, of tradesmen-could not rise above the idea of national liberation, and seemed to believe that the world would be delivered-when they had killed their despot&\u2026\n\"Foolery! They'll never get out of it with their foolery.\"\nThen, lowering his voice still more, in a few bitter words he described his old dream of fraternity. 
He had renounced his rank and his fortune; he had gone among workmen, only in the hope of seeing at last the foundation of a new society of labour in common. All the sous in his pockets had long gone to the urchins of the settlement; he had been as tender as a brother with the colliers, smiling at their suspicion, winning them over by his quiet workmanlike ways and his dislike of chattering. But decidedly the fusion had not taken place.\nHis voice changed, his eyes grew bright, he fixed them on \u00e9tienne, directly addressing him:\n\"Now, do you understand that? These hatworkers at Marseilles who have won the great lottery prize of a hundred thousand francs have gone off at once and invested it, declaring that they are going to live without doing anything! Yes, that is your idea, all of you French workmen; you want to unearth a treasure in order to devour it alone afterwards in some lazy, selfish corner. You may cry out as much as you like against the rich, you haven't got courage enough to give back to the poor the money that luck brings you. You will never be worthy of happiness as long as you own anything, and your hatred of the bourgeois proceeds solely from an angry desire to be bourgeois yourselves in their place.\"\n\u00e9mile Zola, French writer, Germinal, 1885\nThe passage displays the direct concern for the welfare of the working classes that was typically a part of which movement?\n(A) Capitalist (B) Scientific (C) Communist (D) Existentialist\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The modern Communist movement aims to establish a classless society based on communal ownership and distribution of property and means of production, thereby especially benefiting the working classes. The answer is (C).\n\nQ: This question refers to the following information.\nThe following excerpt is from a pamphlet.\nYou will do me the justice to remember, that I have always strenuously supported the Right of every man to his own opinion, however different that opinion might be to mine. He who denies to another this right, makes a slave of himself to his present opinion, because he precludes himself the right of changing it.\nThe most formidable weapon against errors of every kind is Reason. I have never used any other, and I trust I never shall.\nThe circumstance that has now taken place in France of the total abolition of the whole national order of priesthood, and of everything appertaining to compulsive systems of religion, and compulsive articles of faith, has not only precipitated my intention, but rendered a work of this kind exceedingly necessary, lest in the general wreck of superstition, of false systems of government, and false theology, we lose sight of morality, of humanity, and of the theology that is true.\nI believe in one God, and no more; and I hope for happiness beyond this life.\nI believe in the equality of man; and I believe that religious duties consist in doing justice, loving mercy, and endeavoring to make our fellow-creatures happy.\nI do not believe in the creed professed by the Jewish church, by the Roman church, by the Greek church, by the Turkish church, by the Protestant church, nor by any church that I know of. 
My own mind is my own church.\nAll national institutions of churches, whether Jewish, Christian or Turkish, appear to me no other than human inventions, set up to terrify and enslave mankind, and monopolize power and profit.\nI do not mean by this declaration to condemn those who believe otherwise; they have the same right to their belief as I have to mine.\n\u2014Thomas Paine, The Age of Reason, 1794\u20131795\nWhich of the following Enlightenment philosophes designed a system of checks and balances for government to avoid abuses of power?\n(A) Jean Jacques Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft (D) Adam Smith\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. Baron Montesquieu was an 18th-century French philosopher who wrote extensively against the monopolization of power and advocated for a system of checks and balances in government to prevent the rise of despotism. The answer is (B).", "high_school_geography": "The following are multiple choice questions (with answers) about high school geography.\n\nQ: Which one of the following items is an example of nonmaterial culture?\n(A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not physical objects. The answer is (C).\n\nQ: During the third stage of the demographic transition model, which of the following is true?\n(A) Birth rates increase and population growth rate is less rapid. (B) Birth rates decline and population growth rate is less rapid. (C) Birth rates increase and population growth rate increases. (D) Birth rates decrease and population growth rate increases.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. The demographic transition model describes five different stages of population growth as a country goes through economic development, where the third stage refers to a period of declining birth rates and lower population growth. The answer is (B).\n\nQ: The practice of hiring a foreign third-party service provider to run an operation is called\n(A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. \"Offshoring\" literally means to move or base some of the activities or processes of a company to a foreign country. The answer is (B).\n\nQ: Which of the following statements is NOT accurate regarding the services provided by local governments in the United States?\n(A) Duplication of efforts occurs often. (B) Social problems of the central city spill over into the surrounding residential suburbs. (C) Inefficiency in providing services occurs often. (D) One neighborhood's efforts to reduce pollution are always supported by neighboring communities.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. There may be economic, social or political reasons for two neighboring communities and their local governments not agreeing to pollution reduction efforts initiated by one of them. The answer is (D).\n\nQ: The rate of natural increase of a population is found by subtracting the\n(A) crude death rate from the crude birth rate. (B) crude birth rate from the crude death rate. (C) doubling time from the crude birth rate. (D) fertility rate from the crude death rate.\nA: Let's think step by step. 
We refer to Wikipedia articles on geography for help. The difference between the number of births and deaths gives the population increase at any given time. The answer is (A).", "high_school_government_and_politics": "The following are multiple choice questions (with answers) about high school government and politics.\n\nQ: Which of the following best states an argument made by James Madison in The Federalist number 10?\n(A) Honest politicians can prevent factions from developing. (B) Factions are more likely to occur in large republics than in small ones. (C) The negative effects of factionalism can be reduced by a republican government. (D) Free elections are the people's best defense against factionalism.\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. In the Federalist number 10, James Madison advocated for a representative republican form of government to guard against factionalism. The answer is (C).\n\nQ: The term \"budget deficit\" refers to the\n(A) annual increase in federal spending on the military (B) amount of interest on the national debt (C) difference between the initial budget proposals made by the president and Congress (D) amount the government spends in excess of its revenues\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. When the government spends more than it earns, the difference is the budget deficit. The answer is (D).\n\nQ: Which of the following statements about cabinet departments is FALSE?\n(A) They are established by the legislative branch. (B) Their members often don't have much influence over presidential decisions. (C) They cannot all be run by leaders who belong to the same political party the president does. (D) Not every federal agency is a cabinet department.\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. There is no law stipulating that some cabinet department leaders have to belong to a political party different from that of the president. The answer is (C).\n\nQ: Which of the following cases established the precedent that a defendant must be informed of the right to remain silent, the right to a lawyer, and protection from self-incrimination?\n(A) Weeks v. United States (B) Betts v. Brady (C) Mapp v. Ohio (D) Miranda v. Arizona\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. In the landmark Miranda v. Arizona in 1966, the US Supreme Court, based on the Fifth and Sixth Amendments of the US Constitution, guaranteed a defendant's right to an attorney and protection from self-incrimination. The answer is (D).\n\nQ: Uncertainty over the limits to presidential power is caused primarily by the fact that\n(A) the constitutional definition of those powers is broad and unspecific (B) most people agree that the Constitution places too many limits on presidential power (C) the Supreme Court consistently refuses to rule on cases concerning presidential powers (D) constitutional amendments have greatly increased presidential powers\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. The US Constitution is not very specific about the powers of the president, leading to uncertainty over their limits. 
The answer is (A).", "high_school_macroeconomics": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\nQ: Which of the following policies best describes supply-side fiscal policy?\n(A) An increase in the money supply (B) Increased government spending (C) Lower taxes on research and development of new technology (D) Higher taxes on household income\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. Supply-side fiscal policy stimulates the economy by encouraging more production of goods and services through reduction in taxes and deregulation. The answer is (C).\n\nQ: The short-run Phillips curve indicates a\n(A) direct relation between unemployment and inflation (B) direct relation between price and quantity demanded (C) inverse relation between price and quantity demanded (D) inverse relation between unemployment and inflation\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. The short-run Phillips curve shows that whenever unemployment decreases below a natural level, the inflation starts increasing, and vice-versa. The answer is (D).\n\nQ: Holding all else equal which of the following monetary policies would be used to boost U.S. exports?\n(A) Increasing the discount rate (B) Increasing the reserve ratio (C) Buying government securities (D) Lowering tariffs\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. Buying government securities leads to reduction in demand for US dollars from foreign buyers, thereby making it cheaper and hence making US exports more attractive. The answer is (C).\n\nQ: A federal deficit occurs when\n(A) exports exceed imports. (B) imports exceed exports. (C) federal tax collections exceed spending. (D) federal spending exceeds federal tax revenues.\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. A federal deficit occurs when federal spending exceeds federal income which is primarily from tax revenues. The answer is (D).\n\nQ: Which of the following is not included in the U.S. GDP?\n(A) The U.S. military opens a new base in a foreign country with 1000 U.S. personnel. (B) Japanese consumers buy thousands of CDs produced in the United States. (C) An American pop singer performs a sold-out concert in Paris. (D) A French theatrical production tours dozens of American cities.\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. The economic transactions related to the performance of the American pop-singer in Paris happens entirely outside the U.S. and hence is not included in the GDP numbers. The answer is (C).", "high_school_mathematics": "The following are multiple choice questions (with answers) about high school mathematics.\n\nQ: Simplify and write the result with a rational denominator: $$\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}$$\n(A) \\frac{3\\sqrt{3}}{3} (B) \\frac{1}{3} (C) \\sqrt{3} (D) \\frac{\\sqrt{3}}{3}\nA: Let's think step by step. Factoring $729=3^6$ and combining the roots $\\frac{1}{2}\\frac{1}{3}\\frac{1}{2}=\\frac{1}{12}$, we get that $\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}=\\left(\\frac{1}{3^6}\\right)^{\\frac{1}{12}}=\\frac{1}{3^{\\frac{1}{2}}}=\\frac{3}{\\sqrt{3}}$ The answer is (D).\n\nQ: Five thousand dollars compounded annually at an $x\\%$ interest rate takes six years to double. 
At the same interest rate, how many years will it take $\\$300$ to grow to $\\$9600$?\n(A) 12 (B) 1 (C) 30 (D) 5\nA: Let's think step by step. To go from $\\$300$ to $\\$9600$, the value must go up by a factor of $9600/300=32=2^5$. Since at this interest rate it takes six years for it to double, it will take $5*6=30$ years to grow to $\\$9600$. The answer is (C).\n\nQ: Ten students take a biology test and receive the following scores: 45, 55, 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students\u2019 test scores?\n(A) 55 (B) 60 (C) 62 (D) 65\nA: Let's think step by step. There are 10 students and the sum of their scores is $45 + 55 + 50 + 70 + 65 + 80 + 40 + 90 + 70 + 85 = 650$, so the mean is $650/10=65$. The answer is (D).\n\nQ: The variable $x$ varies directly as the square of $y$, and $y$ varies directly as the cube of $z$. If $x$ equals $-16$ when $z$ equals 2, what is the value of $x$ when $z$ equals $\\frac{1}{2}$?\n(A) -1 (B) 16 (C) -\\frac{1}{256} (D) \\frac{1}{16}\nA: Let's think step by step. We know that $x \\propto y^2$ and $y \\propto z^3$, so $x = k z^6$ for some constant $k$. Plugging in for $x=-16$ and $z=2$, the constant value is $k=\\frac{x}{z^6}=\\frac{-16}{64}=-\\frac{1}{4}$. So, when $z=\\frac{1}{2}$, the value of $x$ is $x=kz^6=-\\frac{1}{4}\\frac{1}{2^6}=-\\frac{1}{256}$. The answer is (C).\n\nQ: Joe was in charge of lights for a dance. The red light blinks every two seconds, the yellow light every three seconds, and the blue light every five seconds. If we include the very beginning and very end of the dance, how many times during a seven minute dance will all the lights come on at the same time? (Assume that all three lights blink simultaneously at the very beginning of the dance.)\n(A) 3 (B) 15 (C) 6 (D) 5\nA: Let's think step by step. The least common multiple of 2, 3 and 5 is 30, so during a 7 minute dance, all three lights will come on at the same time $2*7+1=15$ times. The answer is (B).", "high_school_microeconomics": "The following are multiple choice questions (with answers) about high school microeconomics.\n\nQ: Which of the following is necessarily a characteristic of oligopoly?\n(A) Free entry into and exit from the market (B) A few large producers (C) One producer of a good with no close substitutes (D) A homogenous product\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. An oligopoly is a market dominated by just a few large sellers or producers. An oligopolistic market has high barriers to new entry, and its products are often differentiated. The answer is (B).\n\nQ: If the government subsidizes producers in a perfectly competitive market, then\n(A) the demand for the product will increase (B) the demand for the product will decrease (C) the consumer surplus will increase (D) the consumer surplus will decrease\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. (A) and (B) are wrong because the demand curve does not change at all. If the government subsidizes producers, the supply will increase, and thus the consumer surplus also increases. The answer is (C).\n\nQ: Which of the following is true of a price floor?\n(A) The price floor shifts the demand curve to the left. (B) An effective floor creates a shortage of the good. (C) The price floor shifts the supply curve of the good to the right. (D) To be an effective floor, it must be set above the equilibrium price.\nA: Let's think step by step. 
We refer to Wikipedia articles on microeconomics for help. A price floor does not shift the demand or supply curve. An effective price floor must be set above the equilibrium price; otherwise the market clears at the equilibrium price and the floor has no effect. The answer is (D).\n\nQ: The concentration ratio for a monopoly is\n(A) 0 (B) 5 (C) 10 (D) 100\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. The concentration ratio is calculated as the sum of the market shares of a specific number of the largest companies. Monopoly means one company or entity controls the entire market; therefore, the concentration ratio is 100 percent. The answer is (D).\n\nQ: In a competitive labor market for housepainters, which of the following would increase the demand for housepainters?\n(A) An effective minimum wage imposed on this labor market. (B) An increase in the price of gallons of paint. (C) An increase in the construction of new houses. (D) An increase in the price of mechanical painters so long as the output effect exceeds the substitution effect.\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. An increase in the construction of new houses means an increased demand for house painting, which in turn increases the demand for housepainters. The answer is (C).", "high_school_physics": "The following are multiple choice questions (with answers) about high school physics.\n\nQ: A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?\n(A) 10 W (B) 30 W (C) 60 W (D) 240 W\nA: Let's think step by step. Rate of energy usage is known as power; in a dissipative electrical circuit, power is given by voltage times current. So in our case, the power is 120 V times 2 amps, or 240 W. The answer is (D).\n\nQ: A point charge, Q = +1 mC, is fixed at the origin. How much work is required to move a charge, Q = +8 \u00b5C, from the point (0, 4 meters) to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\nA: Let's think step by step. To calculate the work required to move a charge from one location to another in a fixed electric field, it is enough to calculate the potential difference between the two locations. Here, the potential only depends on the distance between the charges; it\u2019s $k q_1 q_2 / r$, where $k$ is Coulomb\u2019s constant. The charge moves from a distance of 4 meters to a distance of 3 meters, so the work is $k q_1 q_2 (1/3 - 1/4)$. Plugging in values $q_1 = 1$ mC and $q_2 = 8 \\mu$C gives the answer as 5.992 J, which rounds to 6 J. The answer is (B).\n\nQ: Which of the following conditions will ensure that angular momentum is conserved? I. Conservation of linear momentum II. Zero net external force III. Zero net external torque\n(A) I and II only (B) I and III only (C) II and III only (D) III only\nA: Let's think step by step. Torque is the rate of change of angular momentum; if there is zero external torque, angular momentum is conserved. The answer is (D).\n\nQ: A photocell of work function \u03d5 = 2eV is connected to a resistor in series. Light of frequency f = 1 \u00d7 10^15 Hz hits a metal plate of the photocell. If the power of the light is P = 100 W, what is the current through the resistor?\n(A) 2:00 AM (B) 6:00 AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The only answer above which has units of current is D, 24 A. The answer is (D).\n\nQ: A pipe full of air is closed at one end. A standing wave is produced in the pipe, causing the pipe to sound a note. 
Which of the following is a correct statement about the wave\u2019s properties at the closed end of the pipe?\n(A) The pressure is at a node, but the particle displacement is at an antinode. (B) The pressure is at an antinode, but the particle displacement is at a node. (C) The pressure and the particle displacement are both at nodes. (D) The pressure and the particle displacement are both at antinodes.\nA: Let's think step by step. At the closed end of the pipe, the particles cannot have any net displacement because the pipe closure stops them. So the particle displacement is at a node. This closure also causes the pressure to be maximal, i.e. an antinode. The answer is (B).", "high_school_psychology": "The following are multiple choice questions (with answers) about high school psychology.\n\nQ: Pascale is interested in the processing strategies children use to learn new information. Pascale would best be classified as what type of psychologist?\n(A) sociocultural (B) clinical (C) cognitive (D) behaviorist\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. A sociocultural psychologist focuses on the effect of societal factors on people. A clinical psychologist focuses on people with mental issues. A cognitive psychologist focuses on how people think and learn, including the processing strategies. A behaviorist focuses more on the effects of environment and experience on people. The answer is (C).\n\nQ: According to Caplan's model of consultee-centered case consultation, the consultant is primarily interested in\n(A) identifying the causes and solutions of the client's presenting problems (B) identifying and eliminating the causes of the consultee's difficulties in handling a problem (C) establishing a hierarchy of authority to enable effective decision making (D) presenting a single, well-defined and unambiguous course of action for the consultant to overcome skills deficits\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Caplan defines two types of consultation. Client-centered case consultation aims to handle the client's problems, while consultee-centered case consultation aims to identify the reasons for the consultee's difficulties in handling a problem. The answer is (B).\n\nQ: According to the Individuals with Disabilities Education Improvement Act, which of the following must an educational agency do before it changes the educational placement of a student with a disability?\n(A) Give the child a trial period in the new environment (B) Notify the parents in writing (C) Obtain school board approval (D) Obtain parental consent\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Before the educational placement of a student with a disability is changed, the educational agency must notify the parents in writing. The answer is (B).\n\nQ: While swimming in the ocean, Ivan is frightened by a dark shadow in the water even before he has the chance to identify what the shadow is. The synaptic connections taking place during this incident of fright are best described by which of the following?\n(A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are sent from the thalamus to the \"what\" and \"where\" pathways. (C) Messages are sent from the parasympathetic nervous system to the cerebral cortex. (D) Messages are sent from the frontal lobes to the pituitary gland.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. 
Our neural system has a mechanism that can respond to immediate emotional signals before they reach the thought centers of the brain. In Ivan's case, messages travel directly from the thalamus to the amygdala. The answer is (A).\n\nQ: Ani believes that her attitudes and behavior play a central role in what happens to her. Such a belief is likely to be associated with\n(A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an internal locus of control.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. People with an external locus of control believe fate and luck play an important role in their lives, while people with an internal locus of control believe they control their lives. The answer is (D).", "high_school_statistics": "The following are multiple choice questions (with answers) about high school statistics.\n\nQ: A new smartwatch is manufactured in one part of a factory, then secured for shipping in another, independent part of the factory. The weight of the smartwatch has a mean of 62 grams and a standard deviation of 1.0 grams. The weight of the packaging (box, user's guide, bubble wrap, etc.) has a mean of 456 grams and a standard deviation of 6 grams. Together, the distribution of the weight of the smartwatch and its packaging would have the following mean and standard deviation:\n(A) Mean 518 grams; standard deviation 7.0 grams (B) Mean 518 grams; standard deviation 3.5 grams (C) Mean 518 grams; standard deviation 6.1 grams (D) Mean 394 grams; standard deviation 6.1 grams\nA: Let's think step by step. Since the weight of the watch and the weight of the packaging are independent random variables, the mean and variance of their sum is equal to the sum of their individual means and variances. So the mean is 62 + 456 = 518 grams, and the variance is 1.0^2 + 6.0^2 = 37, leading to a standard deviation of 6.1 grams. The answer is (C).\n\nQ: After a frost warning was issued, the owner of a large orange grove asked his workers to spray all his trees with water. The water was supposed to freeze and form a protective covering of ice around the orange blossom. Nevertheless, the owner suspected that some trees suffered considerable damage due to the frost. To estimate the proportion of trees that suffered more than 50 percent damage due to the frost, he took a random sample of 100 trees from his grove. What is the response variable in this experiment?\n(A) The proportion of trees that suffered more than 50 percent damage due to frost. (B) The number of trees affected by the frost. (C) The number of trees sampled from the grove. (D) For each sampled tree, whether it suffered more than 50 percent damage or at most 50 percent damage.\nA: Let's think step by step. In this experiment, the response variable is what is measured. For each tree, what is measured is whether or not it suffered more than 50 percent damage due to the frost. The answer is (D).\n\nQ: Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and var(Y) = 12. What are the expected value and variance of the random variable X + Y?\n(A) E(X + Y) = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X + Y) = 99, var(X + Y) = 17 (D) There is insufficient information to answer this question.\nA: Let's think step by step. 
While the means of sums of random variables add (regardless of whether the variables are independent), in order to determine the variance of a sum of random variables we need to know not just their individual variances but also the covariance of the two variables, which is not given in this problem. The answer is (D).\n\nQ: Which of the following sets has the smallest standard deviation? Which has the largest?\nI: {1,2,3}\nII: {-10,10}\nIII: {100}\n(A) I, II (B) II, III (C) III, I (D) III, II\nA: Let's think step by step. The variance of distribution I is the expected squared deviation from its mean (which is 2), so the variance is 2/3. The variance of distribution II is 10^2 (because both elements are 10 away from the mean of zero). The variance of distribution III is 0, since it has a single entry. So distribution III has the smallest standard deviation and distribution II has the largest. The answer is (D).\n\nQ: Which of the following is a correct statement about correlation?\n(A) If the slope of the regression line is exactly 1, then the correlation is exactly 1. (B) If the correlation is 0, then the slope of the regression line is undefined. (C) Switching which variable is called x and which is called y changes the sign of the correlation. (D) The correlation r is equal to the slope of the regression line when z-scores for the y-variable are plotted against z-scores for the x-variable.\nA: Let's think step by step. Statement A is false because the slope of the regression line being exactly 1 can occur even when the two variables are not perfectly correlated. Statement B is false because the regression line for uncorrelated variables has a well-defined slope of zero. Statement C is false because correlation is symmetric in the two random variables. The answer is (D).", "high_school_us_history": "The following are multiple choice questions (with answers) about high school us history.\n\nQ: This question refers to the following information.\nI come not to urge personal claims, nor to seek individual benefits; I appear as the advocate of those who cannot plead their own cause; I come as the friend of those who are deserted, oppressed, and desolate. In the Providence of God, I am the voice of the maniac whose piercing cries from the dreary dungeons of your jails penetrate not your Halls of Legislation. I am the Hope of the poor crazed beings who pine in the cells, and stalls, and cages, and waste rooms of your poor-houses. 
I am the Revelation of hundreds of wailing, suffering creatures, hidden in your private dwellings, and in pens and cabins\u2014shut out, cut off from all healing influences, from all mind-restoring cares.\u2026 Could their melancholy histories be spread before you as revealed to my grieved spirit during the last three months, how promptly, how earnestly would you search out the most approved means of relief; how trifling, how insignificant, by comparison, would appear the sacrifices you are asked to make; how would a few dimes and dollars, gathered from each citizen, diminish in value as a possession, compared with the certain benefits and vast good to be secured for the suffering insane...by the consecration and application of a sufficient fund to the construction of a suitable hospital.\u2026\n\u2014Dorothea Dix, Memorial Soliciting a State Hospital for the Protection and Cure of the Insane,\nSubmitted to the General Assembly of North Carolina, November 1848\nDorothea Dix can best be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C) Shirley Temple (D) Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. Both Dorothea Dix and Clara Barton were American nurses. The answer is (B).\n\nQ: This question refers to the following information.\n\"As our late Conduct at the Conestoga Manor and Lancaster have occasioned much Speculation & a great diversity of Sentiments in this and neighboring Governments; some vindicating & others condemning it; some charitably alleviating the Crime, & others maliciously painting it in the most odious & detestable Colours, we think it our duty to lay before the Publick, the whole Matter as it appeared, & still appears, to us. . . .\n\"If these things are not sufficient to prove an unjustifiable Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend them & an utter insensibility to human Distresses, let us consider a few more recent Facts. When we found the last Summer that we were likely to get no Assistance from the Government, some Volunteers went out at our own Expense, determined to drive our Enemies from our Borders; & when we came near to the great Island, we understood that a Number of their Warriors had gone out against our Frontiers. Upon this we returned and came up with them and fought with them at the Munfey Hill where we lost some of our Men & killed some of their Warriors & thereby saved our Frontiers from this Story in another Expedition. But no sooner had we destroyed their Provisions on the great Island, & ruined their trade with the good People at Bethlehem, but these very Indians, who were justly suspected of having murdered our Friends in Northampton County, were by the Influence of some Quakers taken under the Protection of the Government to screen them from the Resentments of the Friends and Relations of the Murdered, & to support them thro the Winter.\"\n\u2014\"Apology of the Paxton Boys\" (pamphlet), 1764 (Note: \"apology\" in this context should be read as an explanation, not an admission of guilt or regret.)\nThe sentiments expressed in the explanation above reflect which of the ongoing tensions during the colonial period of American history?\n(A) Tensions between British policies and the aspirations of North American colonists. (B) Tensions between American Indians allied with the French and those allied with the British. (C) Tensions between freed African Americans and white planters. 
(D) Tensions between backcountry settlers and elites within colonial America.\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. After the French and Indian War, the Scotch-Irish settlers attacked American Indians. After the attacks on the Conestoga, about 250 Paxton Boys presented their grievances to the Pennsylvania legislature. As mentioned in the information, the Paxton Boys cited resentment of local elites. The answer is (D).\n\nQ: This question refers to the following information.\nOur leaders talk about stopping aggression from the north, but this was a struggle among groups of Vietnamese until we intervened. We seem bent upon saving the Vietnamese from Ho Chi Minh even if we have to kill them and demolish their country to do it. As the native people survey bombed-out villages, women and children burned by napalm, rice crops destroyed and cities overrun with our military personnel, they are doubtless saying secretly of the Vietcong guerillas and of the American forces, \"A plague on both your houses.\" \u2026 Stop the bombing, north and south, end search and destroy offensive sweeps, and confine our military action to holding operations on the ground. Bombing the north has failed to halt or seriously check the flow of troops to the south and may, in fact, have prompted a much greater war effort by Hanoi.\n\u2014Senator George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\nWhich of the following opinions from the 1960s most directly reflects the perspective of George McGovern's speech?\n(A) Americans must maximize their technological edge in Vietnam. (B) American bombing in Vietnam is step by step leading to progress in the war. (C) American bombing in Vietnam is a failure. (D) America must not give in to defeatism about the war in Vietnam.\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. \"Stop the bombing\" and \"Bombing the north has failed to halt or seriously check the flow of troops to the south\" indicate that the perspective of George McGovern's speech is that American bombing in Vietnam is a failure. The answer is (C).\n\nQ: This question refers to the following information.\n\"In the new Code of Laws which I suppose it will be necessary for you to make I desire you would Remember the Ladies, and be more generous and favorable to them than your ancestors. Do not put such unlimited power into the hands of the Husbands. Remember all Men would be tyrants if they could. If particular care and attention is not paid to the Ladies we are determined to foment a Rebellion, and will not hold ourselves bound by any Laws in which we have no voice, or Representation.\"\nAbigail Adams, in a letter to John Adams, 1776\n\"Special legislation for woman has placed us in a most anomalous position. Women invested with the rights of citizens in one section\u2014voters, jurors, office-holders\u2014crossing an imaginary line, are subjects in the next. In some States, a married woman may hold property and transact business in her own name; in others, her earnings belong to her husband. In some States, a woman may testify against her husband, sue and be sued in the courts; in others, she has no redress in case of damage to person, property, or character. In case of divorce on account of adultery in the husband, the innocent wife is held to possess no right to children or property, unless by special decree of the court.
But in no State of the Union has the wife the right to her own person, or to any part of the joint earnings of the co-partnership during the life of her husband. In some States women may enter the law schools and practice in the courts; in others they are forbidden. In some universities girls enjoy equal educational advantages with boys, while many of the proudest institutions in the land deny them admittance, though the sons of China, Japan and Africa are welcomed there. But the privileges already granted in the several States are by no means secure.\"\nSusan B. Anthony, \"Declaration of Rights for Women,\" July 4, 1876\nThe sentiments expressed in the second excerpt by Susan B. Anthony are most likely in support of\n(A) the Equal Rights Amendment (B) universal suffrage (C) states' rights (D) prohibition\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. The above information mentioned that women are in an anomalous position in terms of legislation. In some States, women's earnings do not belong to them, and in others they cannot testify against their husbands. Susan B. Anthony believes women should have the same legal rights as men. The answer is (B).\n\nQ: This question refers to the following information.\n\"Society in every state is a blessing, but government even in its best state is but a necessary evil; in its worst state an intolerable one; for when we suffer, or are exposed to the same miseries by a government, which we might expect in a country without government, our calamity is heightened by reflecting that we furnish the means by which we suffer. Government, like dress, is the badge of lost innocence; the palaces of kings are built on the ruins of the bowers of paradise. For were the impulses of conscience clear, uniform, and irresistibly obeyed, man would need no other lawgiver; but that not being the case, he finds it necessary to surrender up a part of his property to furnish means for the protection of the rest; and this he is induced to do by the same prudence which in every other case advises him out of two evils to choose the least. Wherefore, security being the true design and end of government, it unanswerably follows that whatever form thereof appears most likely to ensure it to us, with the least expense and greatest benefit, is preferable to all others.\"\nThomas Paine, Common Sense, 1776\nWhich of the following \"miseries\" alluded to above were most condemned by Anti-Federalists of the post-Revolutionary era?\n(A) Organized response to Bacon's Rebellion (B) Federal response to Shays's Rebellion (C) Federal response to the Whiskey Rebellion (D) Federal response to Pontiac's Rebellion\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. Anti-Federalists did not believe in centralized government power, and were suspicious of Washington's military response to the Whiskey Rebellion. Bacon's Rebellion and Pontiac's Rebellion happened before the Revolution, so they can be ruled out. The answer is (C).", "high_school_world_history": "The following are multiple choice questions (with answers) about high school world history.\n\nQ: This question refers to the following information.\n\"At least one of the [world's] societies would have to somehow enormously increase its productivity [in order to achieve global hegemony]. That quantum jump would have to be made before the various scientific, technological, agricultural, and industrial revolutions on which our post-quantum-leap world rests.
It could only be accomplished by exploiting the ecosystems, mineral resources, and human assets of whole continents outside the lands of the society making the jump. Western Europe did just that by means of its brutality and guns and, more important, by geographical and ecological luck.\"\nCopyright \u00a9 2015 Cambridge University Press.\nAlfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum jump\" mentioned in the passage most directly contributed to which of the following developments in the period 1450\u20131750 C.E.?\n(A) A breakdown in trade routes through the collapse of the established state structure (B) An increase in the population of the world through more plentiful supplies of food (C) The spread of Chinese and Indian belief systems across the world (D) An increase in social unrest\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The \"quantum jump\" mentioned in the passage refers to the conquest of the New World and the Columbian Exchange. Choices (A) and (C) did not happen in history. Choice (C) refers to the human assets. The answer is (B).\n\nQ: This question refers to the following information.\n\"The struggle against neo-colonialism is not aimed at excluding the capital of the developed world from operating in less developed countries. It is aimed at preventing the financial power of the developed countries being used in such a way as to impoverish the less developed.\nNon-alignment, as practiced by Ghana and many other countries, is based on co-operation with all States whether they be capitalist, socialist or have a mixed economy. Such a policy, therefore, involves foreign investment from capitalist countries, but it must be invested in accordance with a national plan drawn up by the government of the non-aligned State with its own interests in mind. The issue is not what return the foreign investor receives on his investments\u2026The question is one of power. A State in the grip of neo-colonialism is not master of its own destiny.\"\nKwame Nkrumah, Neo-Colonialism, 1965\nWhich of the following provides the best context for Nkrumah's writings?\n(A) The Industrial Revolution (B) Decolonization (C) Regional Free Trade Associations (D) Autarky\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage expresses the point that the successful fight against neo-colonialism was in danger, and that newly independent nations like Ghana might be re-colonized via the financial power of the developed countries. The answer is (B).\n\nQ: This question refers to the following information.\n\"Indeed, as both the fatwas of distinguished [scholars] who base their opinion on reason and tradition alike and the consensus of the Sunni community agree that the ancient obligation of extirpation, extermination, and expulsion of evil innovation must be the aim of our exalted aspiration, for \"Religious zeal is a victory for the Faith of God the Beneficent\"; then, in accordance with the words of the Prophet (Peace upon him!)
\"Whosoever introduces evil innovation into our order must be expelled\" and \"Whosoever does aught against our order must be expelled,\" action has become necessary and exigent\u2026\"\nLetter from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514\nThe letter from Selim I is most clearly an example of which of the following?\n(A) The maintenance of military supremacy at all costs (B) Expanding tensions between religious sects (C) Factors that brought about the collapse of the Ottoman Empire (D) Peacemaking efforts among the Islamic empires\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage is an example of expanding tensions between Selim and Ismail. In the passage the Selim references the fatwa and the consensus of the Sunni community to against whosoever introduces evil. The answer is (B).\n\nQ: This question refers to the following information.\n\"The real grievance of the worker is the insecurity of his existence; he is not sure that he will always have work, he is not sure that he will always be healthy, and he foresees that he will one day be old and unfit to work. If he falls into poverty, even if only through a prolonged illness, he is then completely helpless, exam_ins to his own devices, and society does not currently recognize any real obligation towards him beyond the usual help for the poor, even if he has been working all the time ever so faithfully and diligently. The usual help for the poor, however, leaves a lot to be desired, especially in large cities, where it is very much worse than in the country.\"\nOtto von Bismarck, 1884\nOtto von Bismarck likely made this speech in reaction to which of the following issues?\n(A) Social acceptance of child labor (B) Declining life expectancy in Germany (C) Criticisms of German trade tariffs (D) Negative effects attributed to industrial capitalism\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage talks about the grievance of the work under the industrial capitalism. The answer is (D).\n\nQ: This question refers to the following information.\nHe contains all works and desires and all perfumes and all tastes. He enfolds the whole universe and in silence is loving to all. This is the Spirit that is in my heart, this is Brahman. To him I shall come when I go beyond this life, and to him will come he who has faith and doubts not.\n\u2014The Upanishads, India, c. 1000 BCE\nTo which religion does the speaker most likely belong?\n(A) Hinduism (B) Buddhism (C) Shintoism (D) Zoroastrianism\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. Brahman refers to the ultimate reality of all things in the Hindu religion. In contrast, Buddhism does not have a concept of supreme God. The answer is (A).", "human_aging": "The following are multiple choice questions (with answers) about human aging.\n\nQ: All other things being equal, which of the following persons is more likely to show osteoporosis?\n(A) An older Hispanic American woman (B) An older African American woman (C) An older Asian American woman (D) An older Native American woman\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. Although osteoporosis can occur at any age, the risk is higher for older people. It is most common in Asian and non-Hispanic white women. 
The answer is (C).\n\nQ: The finding that adults tend to remember events from their adolescence better than from other periods in their lives is referred to as the\n(A) Adolescence advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic retrieval spike\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. The reminiscence bump is the phenomenon that older adults tend to recollect events from their younger years. People usually have a period of childhood amnesia from birth to around age 5, and a reminiscence bump between 10 and 30. The answer is (B).\n\nQ: Which element in tobacco smoke is responsible for cancers?\n(A) Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. The benzene, acrylamide and acrylonitrile in tar interact with the lungs and cause DNA mutations in cells of the lungs, leading to cancer. The answer is (B).\n\nQ: When older adults move to a new state after retirement, which of the following is the more likely destination?\n(A) Texas (B) California (C) Hawaii (D) Vermont\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. Texas does not have a state income tax, and has a low cost of living compared with the other three options. The answer is (A).", "human_sexuality": "The following are multiple choice questions (with answers) about human sexuality.\n\nQ: The most common disorder among men who seek sexual therapy is:\n(A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory disorder\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. The most common type of arousal disorder for men is erectile dysfunction, meaning that a person is not able to get any physical satisfaction from sexual activity although he may be interested in it. The answer is (C).\n\nQ: A woman who knows she has active herpes and untreated syphilis but continues to have sex without informing her partners of her condition has, in psychoanalytic terms:\n(A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. A person with a weak superego tends to be delinquent, criminal, or have an antisocial personality. The action of the woman who knows she has an active venereal disease but still has sex with her partners indicates she may have an antisocial personality. The answer is (B).\n\nQ: The nature of homosexual activities that occur during preadolescence include all but which of the following?\n(A) sexual intercourse (B) circle jerks (C) exhibitionism (D) touching each other's genitals\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. Sexual intercourse is prohibited during preadolescence. The answer is (A).\n\nQ: Women's ability to have multiple orgasms is primarily due to:\n(A) the fact that they do not have a refractory period. (B) the response of the inner layers of the vagina. (C) having alternating orgasms in different locations. (D) the G-Spot.\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. The refractory period is the time when a person is not able to be erect or is not interested in sex. The answer is (A).\n\nQ: Morning sickness is typically a problem:\n(A) during the first trimester (B) during the second trimester (C) during the third trimester (D) all through the pregnancy\nA: Let's think step by step.
We refer to Wikipedia articles on human sexuality for help. Morning sickness usually begins by nine weeks after conception, corresponding to the first trimester. The answer is (A).", "international_law": "The following are multiple choice questions (with answers) about international law.\n\nQ: How the consent to be bound of a State may be expressed?\n(A) The consent of a State to be bound is expressed only by ratification (B) The consent of a state to be bound by a treaty may be expressed by signature, ratification, acceptance, approval or accession (C) The consent of a State to be bound is expressed by signature (D) The consent of a State to be bound is expressed by whatever means they choose\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Article 11 of Vienna Convention on the Law of Treaties signed in 1969 states that \"the consent of a State to be bound by a treaty may be expressed by signature, exchange of instruments constituting a treaty, ratification, acceptance, approval or accession, or by any other means if so agreed.\" (B) is the most precise and accurate answer. The answer is (B).\n\nQ: What is the judge ad hoc?\n(A) If a party to a contentious case before the ICJ does not have a national sitting as judge, it is entitled to nominate someone as a judge solely for that case, with the title of judge ad hoc (B) Judge ad hoc is the member of the bench of the ICJ with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge is disqualified or passes away (D) Judge ad hoc is the judge that each party will always nominate in every contentious case\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. As \"ad hoc\" implies, a judge ad hoc is appointed only for a specific case or period, when a party to a contentious case before the International Court of Justice does not have a regular national sitting as judge. The answer is (A).\n\nQ: When 'consent' can serve as a circumstance precluding the wrongfulness of a State conduct?\n(A) Consent can serve as a circumstance precluding the wrongfulness whenever it is given (B) Consent can never serve as a circumstance precluding wrongfulness (C) Consent can serve as a circumstance precluding wrongfulness, provided the consent is valid and to the extent that the conduct remains within the limits of the consent given (D) Consent can always serve as a circumstance precluding wrongfulness, no matter which organ of the State gives it\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Valid consent can serve as a circumstance precluding the wrongfulness of a State conduct if the conduct remains within the limits of that consent, according to Chapter V of the Responsibility of States for Internationally Wrongful Acts, 2001, United Nations. The answer is (C).\n\nQ: Would a reservation to the definition of torture in the ICCPR be acceptable in contemporary practice?\n(A) This is an acceptable reservation if the reserving country's legislation employs a different definition (B) This is an unacceptable reservation because it contravenes the object and purpose of the ICCPR (C) This is an unacceptable reservation because the definition of torture in the ICCPR is consistent with customary international law (D) This is an acceptable reservation because under general international law States have the right to enter reservations to treaties\nA: Let's think step by step. 
We refer to Wikipedia articles on international law for help. Because it contravenes the object and purpose of the ICCPR, this is an unacceptable reservation in contemporary practice. The answer is (B).\n\nQ: What types of force does Article 2(4) of the UN Charter prohibit?\n(A) Article 2(4) encompasses only armed force (B) Article 2(4) encompasses all types of force, including sanctions (C) Article 2(4) encompasses all interference in the domestic affairs of States (D) Article 2(4) encompasses force directed only against a State's territorial integrity\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Article 2(4) of the UN Charter prohibits states from using armed force in their international relations. The answer is (A).", "jurisprudence": "The following are multiple choice questions (with answers) about jurisprudence.\n\nQ: Iverson Jewelers wrote a letter to Miller, 'We have received an exceptionally fine self winding Rolox watch which we will sell to you at a very favorable price.'\n(A) The letter is an offer to sell (B) A valid offer cannot be made by letter. (C) The letter contains a valid offer which will terminate within a reasonable time. (D) The letter lacks one of the essential elements of an offer.\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. An offer shows the intent to enter into a mutually-beneficial contract with specific terms. An offer can be made by a letter. While this letter indicates the willingness to sell, the lack of specific terms, such as transaction price and offer expiration date, makes it an incomplete offer. The answer is (D).\n\nQ: Functions of the law include all but which of the following?\n(A) maximizing individual freedom (B) providing a basis for compromise (C) keeping the peace (D) promoting the principles of the free enterprise system\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. Laws are fundamentally about helping resolve disputes between individuals, and are therefore essential for maximizing individual freedom, providing a basis for compromise, and keeping the peace. The answer is (D).\n\nQ: The ________ School of jurisprudence postulates that the law is based on what is \"correct.\"\n(A) Natural Law (B) Analytical (C) Historical (D) Sociological\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. The Natural Law School of jurisprudence focuses on the laws of nature, and states that the law should be based on ethics, morals, and what is \"correct\". Analytical deals with the law as it already exists, Historical postulates that the law was found and not made, and Sociological studies how the law and society impact each other. The answer is (A).\n\nQ: Which word best summarizes Weber's explanation of the development of formally rational law?\n(A) Authority. (B) Charisma. (C) Co-operation. (D) Capitalism.\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. Weber explained the development of formal rationality in law as how modern society moved from tradition to rationality, where people decide actions based less on how things were culturally done and more on expected utilities. How rational individuals optimize efficiency of accomplishing tasks for higher rewards is a core principle of Capitalism.
The answer is (D).\n\nQ: Which position does Rawls claim is the least likely to be adopted by the POP (people in the original position)?\n(A) The POP would choose equality above liberty. (B) The POP would opt for the 'maximin' strategy. (C) The POP would opt for the 'difference principle'. (D) The POP would reject the 'system of natural liberty.'\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. The POP would opt for the 'maximin' strategy, opt for the 'difference principle', and reject the 'system of natural liberty', but the POP would not choose equality above liberty, since the POP are assumed to be both equal and free citizens. The answer is (A).", "logical_fallacies": "The following are multiple choice questions (with answers) about logical fallacies.\n\nQ: When an arguer causes confusion during refutation because of real or feigned lack of an ability to engage in refutation, that arguer may have committed the fallacy of\n(A) poor sportsmanship (B) appeal to compassion (C) argument against the person (D) ignorance of refutation\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. Ignorance of refutation, one of the logical fallacies in Aristotle's original list in his Organon, is when someone causes confusion in an argument through real or feigned inability to engage in refutation, in order to win the argument. The answer is (D).\n\nQ: The complex question fallacy consists of\n(A) arguing something is inferior just because it doesn't do something it was never intended to do. (B) including more than one claim in the proposition and treating proof for one claim as proof for all the claims. (C) drawing a conclusion before examining the evidence, and only considering evidence that supports that conclusion. (D) asking a question that includes either an unproven assumption or more than one question, thus making a straightforward yes or no answer meaningless.\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The complex question fallacy is when someone makes a single yes or no answer to a question meaningless, by including either an unproven assumption or many questions. The latter is also known as the many questions fallacy. The answer is (D).\n\nQ: Arguing that what is true of the parts must be true of the whole is the fallacy of...\n(A) Division (B) Composition (C) Appeal to the person (D) Appeal to ignorance\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The fallacy of composition occurs when someone argues what is true of the parts must be true of the whole. The answer is (B).\n\nQ: Which of the following is true of a valid categorical syllogism?\n(A) The minor premise must deny the antecedent (B) The major premise must affirm the consequent (C) The middle term must be used in at least one premise in a universal or unqualified sense (D) All of the above\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. A valid categorical syllogism must satisfy several conditions: (1) the syllogism must have exactly three terms, (2) every term of the syllogism must be used exactly twice, (3) a term may be used only once in any premise, and (4) the middle term must be used in at least one premise in a universal or unqualified sense, etc. Only (C) is true.
The answer is (C).\n\nQ: If someone attacks the character of an opposing arguer, instead of responding to that opponent's arguments, the first person has probably committed which of the following fallacies?\n(A) tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The argument against the person fallacy occurs when someone irrelevantly attacks the character of an opposing arguer, instead of addressing that opponent's arguments. The answer is (C).", "machine_learning": "The following are multiple choice questions (with answers) about machine learning.\n\nQ: Which image data augmentation is most common for natural images?\n(A) random crop and horizontal flip (B) random crop and vertical flip (C) posterization (D) dithering\nA: Let's think step by step. Data augmentation is used to increase the diversity of images in the training dataset. It is important that natural images are kept natural after being augmented. Vertical flips of images are not natural, so (B) is false. Posterization makes the image look like a poster, and dithering simulates greater color depth. Neither of these preserves the natural property. The only natural data augmentation technique is (A). The answer is (A).\n\nQ: Traditionally, when we have a real-valued input attribute during decision-tree learning we consider a binary split according to whether the attribute is above or below some threshold. Pat suggests that instead we should just have a multiway split with one branch for each of the distinct values of the attribute. From the list below choose the single biggest problem with Pat\u2019s suggestion:\n(A) It is too computationally expensive. (B) It would probably result in a decision tree that scores badly on the training set and a testset. (C) It would probably result in a decision tree that scores well on the training set but badly on a testset. (D) It would probably result in a decision tree that scores well on a testset but badly on a training set.\nA: Let's think step by step. Because the input is real valued, it is unlikely that the same values appear both at training and test time. This means that while such a decision tree could yield good performance on the training data, when evaluated on the test data it will perform badly because the decision tree won\u2019t know what to do with numbers that did not appear in the training data. The answer is (C).\n\nQ: You are reviewing papers for the World\u2019s Fanciest Machine Learning Conference, and you see submissions with the following claims. Which ones would you consider accepting?\n(A) My method achieves a training error lower than all previous methods! (B) My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise test error.) (C) My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise cross-validation error.) (D) My method achieves a cross-validation error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise cross-validation error.)\nA: Let's think step by step. In machine learning, we train with some data and fixed hyperparameters and the training error can be arbitrarily low, so (A) can\u2019t be right.
Then, one compares different hyperparameters by selecting the model with the lowest cross-validation error; this means that (B) and (D) are not the right procedure. The only relevant number after these is the test error and thus (C) is the right answer. The answer is (C).\n\nQ: A 6-sided die is rolled 15 times and the results are: side 1 comes up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times; side 5: 4 times; side 6: 5 times. Based on these results, what is the probability of side 3 coming up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16 (D) 1.0/5\nA: Let's think step by step. Add-1 smoothing adds the value of one to the different counts and then normalizes the probabilities accordingly. The counts after adding one will be: side 1 comes up 1 time; side 2: 2 times; side 3: 3 times; side 4: 4 times; side 5: 5 times; side 6: 6 times. The sum of all counts will then be 21, so the probability of drawing a three is 3/21 = 1/7. The answer is (B).\n\nQ: To achieve a 0/1 loss estimate that is less than 1 percent of the true 0/1 loss (with probability 95%), according to Hoeffding's inequality the IID test set must have how many examples?\n(A) around 10 examples (B) around 100 examples (C) between 100 and 500 examples (D) more than 1000 examples\nA: Let's think step by step. By Hoeffding\u2019s inequality, we expect that with 95% probability the in-sample and out-of-sample errors differ by at most epsilon when we have N samples if 2 exp(-2 epsilon^2 N) < 0.05; this implies that N > -1/(2*epsilon^2) log(0.05/2) = 5000 log(40). Since log(40)>1, we have that one needs more than 1000 examples. The answer is (D).", "management": "The following are multiple choice questions (with answers) about management.\n\nQ: How can organisational structures that are characterised by democratic and inclusive styles of management be described?\n(A) Hierarchical (B) Bureaucratic (C) Flat (D) Functional\nA: Let's think step by step. We refer to Wikipedia articles on management for help. Flat organizational structures are characterized by democratic and inclusive styles of management, and have few (if any) levels of management between the workers and managers. The answer is (C).\n\nQ: Hygiene factors are associated with which writer?\n(A) Frederick Hertzberg (B) D.C. McClelland (C) Abraham Maslow (D) Douglas McGregor\nA: Let's think step by step. We refer to Wikipedia articles on management for help. Hygiene factors include compensation, company policies, supervision, interpersonal relations, and work environments. Hertzberg lists them as factors that cannot motivate employees but can minimize job dissatisfaction. The answer is (A).\n\nQ: What characteristic is not a key feature of the 'open systems' model of management?\n(A) Morale (B) Innovation (C) Growth resource (D) Adaptation\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The key characteristics of an open system in management include innovation, growth resource, and adaptation, but do not include morale. The answer is (A).\n\nQ: Which element of the cultural web forms regalia?\n(A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The cultural web is a tool for mapping an organization's culture, where symbols form the regalia that visually expresses the values that the organization holds as important.
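The Add-1 smoothing and Hoeffding calculations in the machine learning rationales above can be verified numerically (a minimal Python sketch; the variable names are illustrative only):

from math import log

# Add-1 (Laplace) smoothing: add one to each side's count, then renormalize.
counts = [0, 1, 2, 3, 4, 5]           # observed counts for sides 1..6 over 15 rolls
smoothed = [c + 1 for c in counts]    # counts become 1..6
total = sum(smoothed)                 # 15 + 6 = 21
print(smoothed[2] / total)            # P(side 3) = 3/21 = 1/7 ~= 0.1429

# Hoeffding bound: solve 2*exp(-2*eps^2*N) <= delta for N.
eps, delta = 0.01, 0.05
n = log(2 / delta) / (2 * eps ** 2)   # = 5000*log(40)
print(n)                              # ~= 18444.4, i.e. well over 1000 examples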
The answer is (A).\n\nQ: What are the two main dimensions of the Ohio Studies into leadership?\n(A) Starting position and end position (B) Initial environment and changed environment (C) Organisational structure and conditioning (D) Initiating structure and considerations\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The Ohio State Leadership Studies conducted in the 1940s identified initiating structure and consideration as the two main dimensions of leader behavior. The answer is (D).", "marketing": "The following are multiple choice questions (with answers) about marketing.\n\nQ: Although the content and quality can be as controlled as direct mail, response rates of this medium are lower because of the lack of a personal address mechanism. This media format is known as:\n(A) Care lines. (B) Direct mail. (C) Inserts. (D) Door to door.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Door to door marketing delivers non-addressed items within all buildings within a geographic area. While it can control the content and quality as well as direct mail marketing, its response rate is lower because of the lack of a personal address mechanism. The answer is (D).\n\nQ: In an organization, the group of people tasked with buying decisions is referred to as the _______________.\n(A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making unit.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. In an organization, the group of people tasked with buying decisions is referred to as the decision-making unit. The answer is (D).\n\nQ: The single group within society that is most vulnerable to reference group influence is:\n(A) The older consumer who feels somewhat left out of things. (B) The married women, many of whom feel a need for stability in their lives. (C) New immigrants who really want to assimilate into their new culture. (D) Children, who base most of their buying decisions on outside influences.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Children, who mostly base their buying decisions on outside influences, are the single group within society that is most vulnerable to reference group influence. The answer is (D).\n\nQ: Which of the following is an assumption in Maslow's hierarchy of needs?\n(A) Needs are dependent on culture and also on social class. (B) Lower-level needs must be at least partially satisfied before higher needs can affect behaviour. (C) Needs are not prioritized or arranged in any particular order. (D) Satisfied needs are motivators, and new needs emerge when current needs remain unmet.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. The levels of Maslow's hierarchy of needs, from the bottom upwards, are physiological (food and clothing), safety, love and belonging needs, esteem, and self-actualization. Lower-level needs must be at least partially satisfied before higher ones can affect behavior. The answer is (B).\n\nQ: _____________ is a natural outcome when combining demographic and geographic variables.\n(A) Geodemographics (B) Product differentiation. (C) ANSOFF matrix. (D) Brand management.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Geodemographics is a natural outcome when combining demographic and geographic variables.
The answer is (A).", "medical_genetics": "The following are multiple choice questions (with answers) about medical genetics.\n\nQ: The stage of meiosis in which chromosomes pair and cross over is:\n(A) prophase I (B) metaphase I (C) prophase II (D) metaphase II\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Prophase I is the stage of meiosis where homologous chromosomes pair with each other and exchange genetic material. The answer is (A).\n\nQ: DNA ligase is\n(A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial origin which cuts DNA at defined base sequences (C) an enzyme that facilitates transcription of specific genes (D) an enzyme which limits the level to which a particular nutrient reaches\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. DNA ligase is a type of enzyme (EC 6.5.1.1) responsible for joining DNA strands together by catalyzing a phosphodiester bond. The answer is (A).\n\nQ: Which of the following conditions does not show multifactorial inheritance?\n(A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural tube defects) (D) Marfan syndrome\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Multifactorial inheritance is when more than a single factor is responsible for causing a given trait or health problem. Genes cannot be the only factor. Marfan syndrome, on the other hand, requires only one abnormal copy of the of the Marfan gene, from one parent, to inherit the trait. The answer is (D).\n\nQ: A gene showing codominance\n(A) has both alleles independently expressed in the heterozygote (B) has one allele dominant to the other (C) has alleles tightly linked on the same chromosome (D) has alleles expressed at the same time in development\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Codominance, as it relates to genetics, refers to a type of genetic inheritance where the phenotype of both the parents is easily observed in the offspring. A heterozygote is an individual having two different alleles of a gene. The answer is (A).\n\nQ: Large triplet repeat expansions can be detected by:\n(A) polymerase chain reaction. (B) single strand conformational polymorphism analysis. (C) Southern blotting. (D) Western blotting.\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. A Southern blot is a method in molecular biology for detecting specific DNA sequences in a sample. Large triplet repeat expansions are usually detected with this method. The answer is (C).", "miscellaneous": "The following are multiple choice questions (with answers) about miscellaneous.\n\nQ: Which of these songs was a Top 10 hit for the rock band The Police?\n(A) 'Radio Ga-Ga' (B) 'Ob-la-di Ob-la-da' (C) 'De Do Do Do De Da Da Da' (D) 'In-a-Gadda-Da-Vida'\nA: Let's think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by Queen. Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly. Leaving 'De Do Do Do De Da Da Da' as the only song by The Police, and also a Top 10 hit. The answer is (C).\n\nQ: What place is named in the title of the 1979 live album by rock legends Cheap Trick?\n(A) Budapest (B) Budokan (C) Bhutan (D) Britain\nA: Let's think step by step. We refer to Wikipedia for help. Nippon Budokan is an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including Cheap Trick in 1978. 
'Cheap Trick at Budokan' became the name of their album. The answer is (B).\n\nQ: What is produced during photosynthesis?\n(A) hydrogen (B) nylon (C) oxygen (D) light\nA: Let's think step by step. We refer to Wikipedia for help. Photosynthesis is the process in which green plants use the green pigment chlorophyll to synthesize foods with water and carbon dioxide. Oxygen is the byproduct of this process. The answer is (C).\n\nQ: Who is the shortest man to ever win an NBA slam dunk competition?\n(A) Anthony 'Spud' Webb (B) Michael 'Air' Jordan (C) Tyrone 'Muggsy' Bogues (D) Julius 'Dr J' Erving\nA: Let's think step by step. We refer to Wikipedia for help. In 1986, Spud Webb, standing only 5'7\", became the shortest NBA player in history to win an official slam dunk contest. The answer is (A).\n\nQ: How many axles does a standard automobile have?\n(A) one (B) two (C) four (D) eight\nA: Let's think step by step. We refer to Wikipedia for help. Most cars have two axles to rotate the wheels. The answer is (B).", "moral_disputes": "The following are multiple choice questions (with answers) about moral disputes.\n\nQ: Baron admits that the versions of the ticking bomb hypothetical she discusses are \"stunningly stupid,\" but she claims this is actually evidence of\n(A) the stupidity of most traditional philosophical examples. (B) a general lack of intelligence among people with advanced degrees. (C) the wrongness of torture. (D) the readiness on the part of many intelligent people to see torture as the best solution to deal with terrorism.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. The ticking bomb hypothetical poses a problem where many people will die from an exploding bomb if the hypothetical terrorist does not disclose how to defuse it. Baron sees this hypothetical as silly, but its prevalence does suggest intelligent people, particularly utilitarians, see torture as justifiable to save lives in this scenario. The answer is (D).\n\nQ: A fertilized ovum is also known as\n(A) a zygote. (B) an embryo. (C) a viability. (D) a blastocyst.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. Once a single sperm penetrates the layers of an egg to form a new cell, that cell is called a zygote. The answer is (A).\n\nQ: Pence compares six different cases of reproduction, from natural twinning to SCNT. What conclusion does he draw from this comparison?\n(A) SCNT is not a different kind of reproduction because there are no morally relevant differences between it and other permissible means of reproduction. (B) Because there is a low risk of harm for natural twinning, there will be a low risk of harm for SCNT. (C) both A and B (D) neither A nor B\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. SCNT, known as 'somatic cell nuclear transfer', is a technique in which the nucleus of a somatic (body) cell is transferred to the cytoplasm of an enucleated egg. This is used in farm animal cloning. Pence may compare this to other processes of reproduction, and see little moral separation. The answer is (A).\n\nQ: Which of the following is an intrinsic good, according to Singer?\n(A) being in the upper class (B) the ability to learn and solve problems (C) being more beautiful than most of the population (D) being shorter than average\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help.
Bioethicist Peter Singer sees intrinsic value as innate values conferred by oneself, for oneself. Inanimate objects can be beautiful, short, or have some valuable criteria, but capabilities are intrinsically good. The answer is (B).\n\nQ: According to Metz, what is wrong with consequentialist arguments against capital punishment based on African values?\n(A) It is unclear as of yet whether or not capital punishment deters harm to the community. (B) It is unclear as of yet whether or not capital punishment deters harm to any individuals. (C) Consequentialism is not supported by African values. (D) Even though consequentialism is supported by African values, no consequentialist arguments framed in terms of African values have been offered.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. Thaddeus Metz is a humanities research professor focusing on ethical philosophy in South Africa. Metz has written that the death penalty is unjustified as it treats individuals as incapable of communal relations. It is unclear whether capital punishment is to the benefit of, or a deterrent of harm to, the community. The answer is (A).", "moral_scenarios": "The following are multiple choice questions (with answers) about moral scenarios.\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I took his wallet after he harassed me at the bar. Scenario 2 | I made love to my newborn baby boy.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Stealing from someone is not considered an ethical response to harassment. And having sex with a minor is egregiously unethical. The answer is (A).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I asked for a cigarette from the guy outside the store. Scenario 2 | When the security guard left his post, I pocketed a rare coin.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Asking for something is not morally wrong. Stealing is morally wrong. The answer is (C).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | During the house's demolition, I pushed him into a wall. Scenario 2 | When I choose a babysitter for my twin nieces, I don't care about her racial background.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Physical violence is morally wrong. Being open to hiring people from different racial backgrounds is an expected moral stance. The answer is (B).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I bought an old pick-up truck for work.
Scenario 2 | I looked up at the gorgeous waterfall and threw a rock at it.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Neither of these is wrong, as no one is wronged by buying a pick-up truck or throwing a rock at a waterfall. The answer is (D).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Loving someone is not wrong. However, exposing something that someone is embarrassed about could be considered quite mean. The answer is (C).", "nutrition": "The following are multiple choice questions (with answers) about nutrition.\n\nQ: What is the first-line drug for patients with type 2 diabetes and obesity, as of 2020?\n(A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. Metformin (Fortamet, Glumetza, or others) is usually the first medication prescribed for type 2 diabetes, as well as obesity. It works by lowering glucose production in the liver and improving the body's sensitivity to insulin. The answer is (B).\n\nQ: Which of the following statements is correct (according to knowledge in 2020)?\n(A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame (B) Consumers with phenylketonuria must avoid the consumption of the sweetener saccharin (C) Consumers with phenylketonuria must avoid the consumption of the sweetener sucralose (D) Consumers with phenylketonuria must avoid the consumption of the sweetener acesulfame K\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. People with phenylketonuria (PKU) cannot break down the amino acid phenylalanine. As it builds up in the blood and brain it can lead to brain damage. People with PKU should avoid foods that are converted to phenylalanine in the body, such as aspartame. The answer is (A).\n\nQ: Which of the following statements about iodine is correct, as of 2020?\n(A) 50% of adults consume iodine at levels below the RNI (B) Dairy products are a poor source of iodine (C) The iodine content of organic milk is generally lower than the level in non-organic milk (D) UK dietary reference values recommend an increase in iodine intake in pregnancy\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. Organic milk usually has less iodine content than non-organic milk. The answer is (C).\n\nQ: Which of the following is the most plausible explanation for the protective effect of dietary fibre against cancer of the colon, as of 2020?\n(A) Propionic acid, formed during colonic fibre fermentation inhibits liver fatty acid synthesis (B) Butyric acid, formed during colonic fibre fermentation stimulates \"silencing\" of the SLC5A8 tumour suppressor gene (C) None of these options are correct (D) Butyric acid, formed during colonic fibre fermentation stimulates anti-oxidant defences in the colon\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help.
Dietary fibre intake is inversely associated with the risk of colorectal cancer. This is presumed because butyric acid (BA) stimulates anti-oxidant defences, which help protect the colon from cancerous tumors. The answer is (D).\n\nQ: In a cohort study, the risk ratio of developing diabetes was 0.86 when comparing consumers of tea (the exposed) to those who did not drink tea (the unexposed). Which one statement is correct (according to knowledge in 2020)?\n(A) The tea drinkers have lower risk of developing diabetes. (B) The tea drinkers have higher risk of developing diabetes. (C) Based on the information given we cannot tell if the observed difference in disease risk is the result of chance. (D) The risk ratio is close to the value one, so there is no difference in disease risk between the two groups.\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. The risk ratio is not sufficiently reduced to rule out random chance as an explanation, given the study's sample size. The answer is (C).", "philosophy": "The following are multiple choice questions (with answers) about philosophy.\n\nQ: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n(A) metaphysics (B) epistemology (C) quantum physics (D) axiology\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Among the options, only metaphysics studies the nature of reality and existence. The answer is (A).\n\nQ: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\n(A) pleasure. (B) happiness. (C) good. (D) virtue.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Moore's \"ideal utilitarianism\" states that one's actions should maximize intrinsic goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n(A) optimist (B) satisfied (C) nominally religious (D) pessimist\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Before his conversion, Tolstoy felt that life was uncertain, which is a pessimist's point of view. The answer is (D).\n\nQ: According to d'Holbach, people always act according to _____.\n(A) free choices (B) dictates of the soul (C) necessary natural laws (D) undetermined will\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. d'Holbach believes that people act according to necessary natural laws, which leaves no room for free will. The answer is (C).\n\nQ: Psychological egoism is:\n(A) an ethical theory about how we ought to behave. (B) a generalization concerning the way people tend to behave. (C) a claim about human nature and the ways people are capable of behaving. (D) none of the above.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Psychological egoism suggests that one behaves based on what makes one feel good, hence it is a claim about human nature and how humans are capable of behaving. The answer is (C).", "prehistory": "The following are multiple choice questions (with answers) about prehistory.\n\nQ: What is the approximate mean cranial capacity of Homo erectus?\n(A) under 650 cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. The average cranial capacity of Homo erectus is less than 1000 cubic cm.
The answer is (C).\n\nQ: According to Timothy Pauketat, the evidence for social stratification and political power at Cahokia suggests:\n(A) a center of Mississippian civilization with conditions similar to the rise of early states. (B) the limitations of authority in a Native American society of egalitarian foragers. (C) a simple chiefdom or perhaps a complex chiefdom had evolved by A.D. 1500. (D) a center of Mississippian civilization with conditions similar to societies on the Northwest Coast of North America.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian culture, where he found similar conditions to the rise of early states. The answer is (A).\n\nQ: Recent research on hominid species dating from the Middle Pliocene indicates there was (as of 2020):\n(A) a great amount of species diversity, or a single species that exhibited a lot of diversity. (B) very little species diversity during this period and very few hominids. (C) decreased species diversity due to a prolonged ice age followed by a severe drought. (D) decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Recent research has recognized multiple hominid species from the Middle Pliocene, meaning that there is a great amount of species diversity or diversity in a single species. The answer is (A).\n\nQ: Researchers now believe that the decline of the Maya was caused chiefly by:\n(A) a cataclysm of some kind, such as an earthquake, volcano, or tsunami. (B) ecological degradation resulting from slash-and-burn farming techniques. (C) endless wars between neighboring Mayan city-states. (D) practices of interbreeding that led to a steep rise in congenital disorders.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Researchers believe that the Maya collapse was mainly caused by over-exploitation of natural resources through slash-and-burn farming techniques. The answer is (B).\n\nQ: The great Mayan king Pacal built temples in the city of Palenque in order to:\n(A) satisfy the powerful Mayan astronomer priests. (B) display his generosity to the common people, since they were allowed to live in the temples. (C) frighten away enemies, in particular the Spaniards. (D) legitimize his kingship, since his father was not royal.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Pacal built the temples as funerary monuments to legitimize his kingship. The answer is (D).", "professional_accounting": "The following are multiple choice questions (with answers) about professional accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer\u2019s subledger. Which of the following management assertions is supported by this test?\n(A) Valuation and allocation (B) Completeness (C) Rights and obligations (D) Presentation and disclosure\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The completeness assertion is tested by tracing supporting documents to the record entries. The answer is (B).\n\nQ: One hundred years ago, your great-great-grandmother invested $100 at 5% yearly interest. What is the investment worth today?\n(A) $13,000 (B) $600 (C) $15,000 (D) $28,000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help.
A $100 investment at 5% yearly interest is worth 100*(1.05)^100=13150 after 100 years, which is around $13,000. The answer is (A).\n\nQ: On January 1, year 1, Alpha Co. signed an annual maintenance agreement with a software provider for $15,000 and the maintenance period begins on March 1, year 1. Alpha also incurred $5,000 of costs on January 1, year 1, related to software modification requests that will increase the functionality of the software. Alpha depreciates and amortizes its computer and software assets over five years using the straight-line method. What amount is the total expense that Alpha should recognize related to the maintenance agreement and the software modifications for the year ended December 31, year 1?\n(A) $5,000 (B) $13,500 (C) $16,000 (D) $20,000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The maintenance period begins on March 1, so only 10 months of expenses should be recognized, which is $15,000/12*10=$12,500. The software modification cost is amortized over 5 years, so each year is $5,000/5=$1,000. So the total expense is $12,500+$1,000=$13,500. The answer is (B).\n\nQ: Krete is an unmarried taxpayer with income exclusively from wages. By December 31, year 1, Krete's employer has withheld $16,000 in federal income taxes and Krete has made no estimated tax payments. On April 15, year 2, Krete timely filed for an extension request to file her individual tax return, and paid $300 of additional taxes. Krete's year 1 tax liability was $16,500 when she timely filed her return on April 30, year 2, and paid the remaining tax liability balance. What amount would be subject to the penalty for underpayment of estimated taxes?\n(A) $0 (B) $500 (C) $1,650 (D) $16,500\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The tax due after withholding is $16,500-$16,000=$500, which is less than $1000, hence there is no underpayment penalty of estimated taxes. The answer is (A).\n\nQ: Box a nongovernmental not-for-profit organization had the following transactions during the year: Proceeds from sale of investments $80000 Purchase of property plant and equipment $10000 Proceeds from long-term debt $100000 Loss on sale of investment $5000 What amount should be reported as net cash provided by financing activities in Box's statement of cash flows?\n(A) $70,000 (B) $75,000 (C) $80,000 (D) 100000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. Among the four transactions, only Proceeds from long-term debt belongs to the financing activities section of cashflow, hence the amount reported should be $100000. The answer is (D).", "professional_law": "The following are multiple choice questions (with answers) about professional law.\n\nQ: A son owed a creditor $5,000. The son's father contacted the creditor and told him that he wanted to pay the son's debt. The father signed a document that stated the father would pay the son's debt at a rate of $500 a month for 10 months. The creditor made no written or oral commitment to forbear to sue the son to collect the $5,000 debt, and the father made no oral or written request for any such forbearance. For the next five months, the father made and the creditor accepted the $500 monthly payments as agreed. During that period, the creditor, in fact, did forbear to take any legal action against the son. However, the father then informed the creditor that he would make no further payments on the debt. 
Which of the following is the most persuasive argument that the father is liable to the creditor under the terms of their agreement?\n(A) The father's promise and the creditor's reliance thereon, if proved, gave rise to a valid claim by the creditor against the father based on the doctrine of promissory estoppel. (B) Because it was foreseeable that the father's promise would induce the creditor to forbear taking any action against the son, such forbearance was, as a matter of law, a bargained-for consideration for the father's promise. (C) The father's five payments to the creditor totaling $2,500 manifested a serious intent on the father's part to be contractually bound, and such manifestation is generally recognized as an effective substitute for consideration. (D) By assuming the antecedent debt obligation that the son owed to the creditor, the father became a surety whose promise to the creditor was enforceable, since it was in writing and supported by adequate consideration. \nA: Let's think step by step. We refer to Wikipedia articles on law for help. The doctrine of promissory estoppel stops a person from going back on a promise in contract law, hence option (A) should be the most persuasive argument. The answer is (A).\n\nQ: A state has recently enacted a statute prohibiting the disposal of any nuclear wastes within the state. This law does not contravene or conflict with any federal statutes. A man operates a company in the state that is engaged in the disposal of nuclear wastes. Subsequent to the passage of the state statute, the man, not yet aware of the new law, entered into contracts with many out-of-state firms to dispose of their nuclear wastes in the state. On account of this new law, however, the man will be unable to perform these contracts. Assume that the man has standing to challenge this state law. Which of the following presents his strongest constitutional grounds to challenge the state law prohibiting the disposal of nuclear wastes within the state?\n(A) The commerce clause. (B) The equal protection clause of the Fourteenth Amendment. (C) The privileges and immunities clause of Article IV, Section 2. (D) The contract clause.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. The commerce clause states that Congress shall have the power to regulate commerce with foreign Nations, and among the several States, and with the Indian Tribes. The statute affects inter-state commerce which puts it into question. Hence the man's strongest argument should be the commerce clause. The answer is (A).\n\nQ: On October 1, 1980, a developer, owner of several hundred acres in a rural county, drafted a general development plan for the area. The duly recorded plan imposed elaborate limitations and restrictions upon the land in the plan, which was to be developed as a residential district. The restrictions were to extend to all persons acquiring any of the lots and to their heirs, assigns, and lessees. It was further provided that all subsequent owners would be charged with due notice of the restrictions. Among those restrictions in the general plan were the following:(22) A franchise right is created in a strip of land 10 feet in width along the rear of each lot for the use of public utility companies with right of ingress and egress. (23) No house or structure of any kind shall be built on the aforementioned strip of land running through the said blocks. 
In 2000, a retiree purchased one of the lots, built a house, and erected a fence in the rear of his property within the restricted area. In 2004, a teacher purchased a lot adjacent to the retiree's property and built a new house. Two years later, a librarian purchased the lot that adjoined the teacher's property. The three deeds to those properties each contained references to the deed book where the general plan was recorded. In 2008, the librarian began the construction of a seven-foot post-and-rail fence along the line dividing his lot with the teacher's, and along the center of the area subject to the franchise right. Although the teacher objected to its construction, the fence was completed. If the teacher seeks a mandatory injunction to compel removal of the librarian's fence, the court will most likely\n(A) grant relief, because the fence was in violation of the easement restriction. (B) grant relief, because the encroachment of the fence violated the restriction in the original plan. (C) deny relief, because the teacher failed to enforce the restriction against the retiree. (D) deny relief, because the fence would not be construed as \"a structure\" within the terms of the restriction. \nA: Let's think step by step. We refer to Wikipedia articles on law for help. The restrictions in the original plan say no house or structure of any kind shall be built on the aforementioned strip of land running through the said blocks. Hence the court will most likely grant relief because the fence violated the restriction in the original plan. The answer is (B).\n\nQ: Judge took judicial notice of some facts at the beginning of the trial. Which of the following is not an appropriate kind of fact for judicial notice?\n(A) Indisputable facts. (B) Facts that have been asserted by individual political organizations. (C) Facts recognized to be true by common knowledge. (D) Facts capable of scientific verification.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. Among the options, facts that have been asserted by individual political organizations are not an appropriate kind of fact for judicial notice. The answer is (B).\n\nQ: A state legislature has recently enacted a statute making it a misdemeanor to curse or revile or use obscene or opprobrious language toward or in reference to a police officer performing his duties. A student at a state university organized a demonstration on campus to protest the war. The rally was attended by a group of 50 students who shouted anti-war messages at cars passing by. To show his contempt for the United States, the student sewed the American flag to the rear of his jeans. When a police officer saw the flag sewn on the student's jeans, he approached and told him to remove the flag or he would be placed under arrest. The student became angered and shouted at the police officer, \"Listen, you bastard, I'll wear this rag anywhere I please. \" The student was subsequently placed under arrest and charged with violating the state statute. The student subsequently brings suit in state court challenging the constitutionality of the statute. The strongest constitutional argument for the student is that\n(A) the statute is void for vagueness under the Fourteenth Amendment's due process clause. (B) the statute is invalid because it violates the petitioner's freedom of speech under the First Amendment.
(C) the statute is an abridgment of freedom of speech under the First Amendment because less restrictive means are available for achieving the same purpose. (D) the statute is overbroad and consequently invalid under the First and Fourteenth Amendments.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. The Fourteenth Amendment further supports the First Amendment by establishing a due process clause. Hence the strongest argument should be the statute is overbroad and consequently invalid under the First and Fourteenth Amendments. The answer is (D).", "professional_medicine": "The following are multiple choice questions (with answers) about professional medicine.\n\nQ: A 22-year-old male marathon runner presents to the office with the complaint of right-sided rib pain when he runs long distances. Physical examination reveals normal heart and lung findings and an exhalation dysfunction at ribs\u00a04-5 on the right. Which of the following muscles or muscle groups will be most useful in correcting this dysfunction utilizing a direct method?\n(A) anterior scalene (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Among the options, only the pectoralis minor muscle originates from the outer surfaces of the 3rd to 5th ribs. The answer is (C).\n\nQ: A 36-year-old male presents to the office with a\u00a03-week\u00a0history of low back pain. He denies any recent trauma but says that he climbs in and out of his truck numerous times a day for his job. Examination of the patient in the prone position reveals a deep sacral sulcus on the left, a posterior inferior lateral angle on the right, and a lumbosacral junction that springs freely on compression. The most likely diagnosis is\n(A) left-on-left sacral torsion (B) left-on-right sacral torsion (C) right unilateral sacral flexion (D) right-on-right sacral torsion\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The deep sulcus on the left, a posterior ILA on the right, with a negative spring test suggests a right-on-right sacral torsion. All other options have a deep sulcus on the right. The answer is (D).\n\nQ: A 44-year-old man comes to the office because of a 3-day history of sore throat, nonproductive cough, runny nose, and frontal headache. He says the headache is worse in the morning and ibuprofen does provide some relief. He has not had shortness of breath. Medical history is unremarkable. He takes no medications other than the ibuprofen for pain. Vital signs are temperature 37.4\u00b0C (99.4\u00b0F), pulse 88/min, respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the nares shows erythematous mucous membranes. Examination of the throat shows erythema and follicular lymphoid hyperplasia on the posterior oropharynx. There is no palpable cervical adenopathy. Lungs are clear to auscultation. Which of the following is the most likely cause of this patient's symptoms?\n(A) Allergic rhinitis (B) Epstein-Barr virus (C) Mycoplasma pneumonia (D) Rhinovirus\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The symptoms, especially the headache, suggest that the most likely cause is Rhinovirus. Epstein-Barr virus will cause swollen lymph nodes but there is no palpable cervical adenopathy. That the lungs are clear to auscultation suggests it's not Mycoplasma pneumonia.
The answer is (D).\n\nQ: A previously healthy 32-year-old woman comes to the physician 8 months after her husband was killed in a car crash. Since that time, she has had a decreased appetite and difficulty falling asleep. She states that she is often sad and cries frequently. She has been rechecking the door lock five times before leaving her house and has to count exactly five pieces of toilet paper before she uses it. She says that she has always been a perfectionist but these urges and rituals are new. Pharmacotherapy should be targeted to which of the following neurotransmitters?\n(A) Dopamine (B) Glutamate (C) Norepinephrine (D) Serotonin\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The patient feels sad and among the options, only Dopamine and Serotonin can help increase positive emotions. Serotonin also affects digestion and metabolism, which can help the patient's decreased appetite and sleep difficulty. The answer is (D).\n\nQ: A 42-year-old man comes to the office for preoperative evaluation prior to undergoing adrenalectomy scheduled in 2 weeks. One month ago, he received care in the emergency department for pain over his right flank following a motor vehicle collision. At that time, blood pressure was 160/100 mm Hg and CT scan of the abdomen showed an incidental 10-cm left adrenal mass. Results of laboratory studies, including complete blood count, serum electrolyte concentrations, and liver function tests, were within the reference ranges. The patient otherwise had been healthy and had never been told that he had elevated blood pressure. He takes no medications. A follow-up visit in the office 2 weeks ago disclosed elevated urinary normetanephrine and metanephrine and plasma aldosterone concentrations. The patient was referred to a surgeon, who recommended the adrenalectomy. Today, vital signs are temperature 36.6\u00b0C (97.9\u00b0F), pulse 100/min, respirations 14/min, and blood pressure 170/95 mm Hg. Physical examination discloses no significant findings. Initial preoperative preparation should include treatment with which of the following?\n(A) Labetalol (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The symptoms and the adrenal mass suggested pheochromocytoma, and the blood pressure indicates hypertension. Phenoxybenzamine is used to treat hypertension caused by pheochromocytoma. The answer is (D).", "professional_psychology": "The following are multiple choice questions (with answers) about professional psychology.\n\nQ: In the construction of a multiple regression equation for purposes of prediction, the optimal combination of measures is one in which the predictors\n(A) are uncorrelated with each other but are moderately correlated with the criterion (B) have low correlations with each other and low correlations with the criterion (C) are highly intercorrelated with each other and moderately correlated with the criterion (D) have low correlations with the criterion but are moderately correlated with each other\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The basis of multiple regression is to assess the relationship between one continuous variable and a set of independent variables. So the predictors should be uncorrelated with each other but are moderately correlated with the criterion.
The answer is (A).\n\nQ: There are three ways to measure the Central Tendency: the Mean, the Median and the Mode. From your knowledge about them, what is the mode?\n(A) less sensitive to extreme scores than the mean (B) more useful for skewed distributions (C) sensitive to extreme values and highly skewed distributions (D) the most frequently occurring number\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The definition of mode is the most frequently occurring number. The answer is (D).\n\nQ: Carl Jung believed that a client's transference:\n(A) is a fantasy that distracts the client from reality. (B) represents \u201cmixed feelings\u201d toward the therapist. (C) \"is a form of \"\"acting out.\"\"\" (D) reflects the client\u2019s personal and collective unconscious.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Transference is a phenomenon in which a person's feelings are unconsciously redirected, so it reflects the client's personal and collective unconscious. The answer is (D).\n\nQ: In terms of Hofstede\u2019s (1980) five cultural dimensions, the United States scores at the top of the scale on:\n(A) individualism. (B) individualism and power distance. (C) power distance and masculinity. (D) uncertainty avoidance.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The US scores highest on individualism among the five cultural dimensions. The answer is (A).\n\nQ: One of your therapy clients asks your advice about a good weight- reduction program. You have investigated the programs in the community and are enrolled in the one you consider the best. This program offers a $50 bonus to its patrons for each new person they bring into the program. Under these circumstances, your most appropriate response would be to\n(A) tell your client the pros and cons of each program you know about except for the one in which you are enrolled (B) recommend to your client the program in which you are enrolled and explain the $50 bonus you will receive (C) recommend to your client the program in which you are enrolled and offer to have the $50 bonus credited to your client's account in the program (D) tell your client the pros and cons of each program you know about, but do not claim the $50 bonus if your client enrolls in your program\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Based on the circumstances, you should tell your client about the pros and cons of each program, but it would be inappropriate to receive the bonus, so you should not claim the $50 bonus. The answer is (D).", "public_relations": "The following are multiple choice questions (with answers) about public relations.\n\nQ: Earth Hour was a campaign launched by which organization?\n(A) Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. Earth Hour is a worldwide movement organized and launched by the World Wildlife Fund. The answer is (D).\n\nQ: In issues management, what is the most proactive approach to addressing negative or misleading information posted online about your organization?\n(A) Buy domain names that could be used by opposition groups. (B) Post anonymous comments on blogs to combat this information. (C) Prepare a news release that discredits the inaccurate information. (D) Make policy changes to address complaints highlighted on these sites.\nA: Let's think step by step.
We refer to Wikipedia articles on public relations for help. In issues management, the most proactive approach to addressing negative or misleading information posted online is to make policy changes to address complaints highlighted on those sites. The answer is (D).\n\nQ: At which stage in the planning process would a situation analysis be carried out?\n(A) Defining the program (B) Planning the program (C) Taking action and implementing ideas (D) Evaluation of the program\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. Situation analyses are typically carried out during the planning process stage of defining the program. The answer is (A).\n\nQ: Which of these statements is true of the Vatican in 2010 at the time of the accusations of child abuse cover-ups?\n(A) There was a coordinated media response. (B) Consistent messages were communicated. (C) Criticisms were taken as attacks on the Catholic Church. (D) The credibility of the Vatican was upheld.\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. In 2010 when there were accusations of child abuse cover-ups, the Vatican took those criticisms as attacks on the Catholic Church. The answer is (C).\n\nQ: What should a public relations media practitioner do if she does not know the answer to a reporter's question?\n(A) Give the reporter other information she is certain is correct. (B) Say that the information is 'off the record' and will be disseminated later. (C) Say 'I don't know' and promise to provide the information later. (D) Say 'no comment,' rather than appear uninformed.\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. If a public relations media practitioner does not know the answer to a reporter's question, they should say 'I don't know' and offer to provide the information later. The answer is (C).", "security_studies": "The following are multiple choice questions (with answers) about security studies.\n\nQ: What are the frameworks of analysis within which terrorism has been considered (as of 2020)?\n(A) Competition between larger nations has resulted in some countries actively supporting terrorist groups to undermine the strength of rival states. Terrorist networks are extended patronage clubs maintained and paid for by their donor states and are conceptualised as being like state actors, to be dealt with using military force. (B) Globalization has enabled the internationalization of terrorist activities by opening up their operational space, although coordination is still managed from a geographical base. This suggests that terrorist groups are nationally structured which means that terrorism cannot be considered in terms of a war to be defeated militarily without having serious implications on the indigenous population. (C) Terrorism can be viewed as a problem to be resolved by military means (war on terrorism), by normal police techniques (terrorism as crime), or as a medical problem with underlying causes and symptoms (terrorism as disease). (D) Terrorism is viewed as a criminal problem. The criminalization of terrorism has two important implications. Firstly, it suggests that terrorism can be eradicated - terrorists can be caught and brought to trial by normal judicial proceedings thereby removing the threat from society - and secondly, it suggests that preventative crime techniques are applicable to prevent its development.\nA: Let's think step by step. 
We refer to Wikipedia articles on security studies for help. (A) is wrong because it is not competition between larger nations that causes terrorism. \n(B) is wrong because globalization is not the cause of terrorism.\n(C) is correct because the US undertook the war on terrorism. \n(D) is wrong because preventative crime techniques will likely not end terrorism. The answer is (C).\n\nQ: Which of the following is the best lens through which to investigate the role of child soldiers?\n(A) Child soldiers are victims of combat that need re-education and rehabilitation. (B) Children and their mothers are not active subjects in warfare and are best considered as subjects in the private sphere. (C) Children are most often innocent bystanders in war and are best used as signifiers of peace. (D) Children have political subjecthood that is missed when they are considered as passive victims of warfare.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Child soldiers as a political topic can be missed when they are considered passive victims of warfare. The answer is (D).\n\nQ: How can we best describe the relationship between the state-centric approach and the concept of human security?\n(A) There are such wide divisions within the human security framework regarding the nature of threats and referent objects that no widely applicable comparisons between state-centric approaches and human security can be drawn. (B) By adopting the framework of human security, the limitations of the realist state-centric approach become evident. Whilst human security defines the referent object as the person or population, state-centric approaches prioritise the security of the state, de-prioritizing the pursuit of human security. (C) The state-centric approach to security is a faction of human security, usually defined within the broad school of human security. By being state-centric this approach prioritises the individual as the referent object in security studies. (D) Both the state-centric and human-centric approaches to security are mutually exclusive and offer a sufficient analytic framework with which to understand the international security system. It is therefore the role of security analysts to determine which of these substantial concepts is correct, and which should be discarded.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Human security focuses on a person or population whereas state-centric approaches focus on the state while deprioritizing human security. The answer is (B).\n\nQ: In order to become securitized, a threat must be presented in which of these ways?\n(A) As an existential threat that requires immediate and extraordinary action, posing a threat to the survival of the state or to societal security. (B) As requiring immediate and extraordinary action by the state, threatening the survival of a referent object and therefore warranting the use of measures not normally employed in the political realm. (C) As an urgent threat to the survival of the referent object, so serious that it legitimises the employment of extraordinary action in response. (D) As an urgent threat to the survival of the audience that requires extraordinary or emergency measures.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. To be securitized, a threat must be an urgent threat to the survival of the referent object.
The answer is (C).\n\nQ: What distinguishes coercive diplomacy from military force?\n(A) Compellence is another term for coercive diplomacy, but covering a narrower set of criteria; compellence covers those threats aimed at initiating adversary action. A threat to coerce a state to give up part of its territory would count as coercive diplomacy, as long as that threat proactively initiates action before reactive diplomacy is taken. (B) Coercive diplomacy constitutes the threats of limited force to induce adversary's incentive to comply with the coercer's demands. It is an influence strategy that is intended to obtain compliance: the use of force to defeat an opponent first does not count. It leaves an element of choice with the target to comply, or to continue. (C) Military force, or the threat of military force, utilises fear to achieve strategic objectives. Coercive diplomacy is differentiated from this approach, because it does not use fear as a tool for coercing an adversary. (D) Coercive diplomacy is employed to use force but to limit its effects on the international community. Coercive diplomacy is an aggressive strategy that is intended to obtain compliance through defeat. It does not leave an element of choice with the target, the target either being forced to comply or engage in conflict. It seeks to control by imposing compliance by removing any opportunity for negotiation or concession.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Coercive diplomacy uses the threat of force to induce the opponent to comply with demands. The answer is (B).", "sociology": "The following are multiple choice questions (with answers) about sociology.\n\nQ: Which of the following is not a problem associated with official statistics on strike action?\n(A) most strikes go unnoticed by employers and the mass media (B) not all industrial disputes will be reported by the employer (C) the definition of strikes excludes those that involve fewer than ten workers or last less than one day (D) it is hard to compare strikes that were measured in different ways\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Official statistics on strike action can be problematic because not all industrial disputes will be reported by employers, the definition of strikes excludes those that involve fewer than ten workers or last less than one day, and it is hard to compare strikes that were measured in different ways. Thus, (A) is not a problem associated with official statistics on strike action. The answer is (A).\n\nQ: What does Berger (1963) describe as a metaphor for social reality?\n(A) a fairground ride (B) a circus (C) a puppet theatre (D) a ballet\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Berger describes social reality using the metaphor of a puppet theatre. The answer is (C).\n\nQ: The term 'hegemony' refers to:\n(A) the tendency for the working class not to realize their own interests (B) a dominant ideology that legitimates economic, political and cultural power (C) a form of dual consciousness based on ideology and everyday experiences (D) a mode of payment given for outstanding topiary\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Hegemony refers to a dominant ideology that legitimates economic, political, and cultural power.
The answer is (B).\n\nQ: The shift from 'civil religion' to 'common religion' means that:\n(A) the increasing bureaucracy of the state has made religion only a marginal part of our lives (B) despite the weakening of traditional authority, our everyday lives and 'common sense' remain shaped by religious beliefs and values (C) religious participation in collective worship may have declined, but people still practise their faiths in private (D) people are much more likely to discuss their religious beliefs in public, informal settings\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. The shift from civil religion to common religion means that despite the weakening of traditional authority, our everyday lives and common sense remain shaped by religious beliefs and values. The answer is (B).\n\nQ: Which of the following did the post-war welfare state of 1948 not aim to provide:\n(A) free health care and education for all (B) a minimum wage (C) full employment (D) universal welfare\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. The post-war welfare state of 1948 aimed to provide free healthcare and education, full employment, and universal welfare. But it did not aim to provide a minimum wage. The answer is (B).", "us_foreign_policy": "The following are multiple choice questions (with answers) about us foreign policy.\n\nQ: How did Donald Trump attack globalization in the 2016 campaign?\n(A) Globalization had made men like him too rich (B) Globalization only benefited certain American states, such as New York (C) Liberal elites had encouraged globalization, while 'ordinary Americans' lost jobs because of it (D) Globalization encouraged damaging trade wars\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. Trump attacked globalization because he believed ordinary Americans lost jobs due to it, and so he wanted to blame liberals who had encouraged it. The answer is (C).\n\nQ: How did NSC-68 change U.S. strategy?\n(A) It globalized containment. (B) It militarized containment. (C) It called for the development of the hydrogen bomb. (D) All of the above\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. NSC-68 outlined a variety of courses of action, including globalization of containment, militarization of containment, and the development of the hydrogen bomb. The answer is (D).\n\nQ: How do Defensive Realism and Offensive Realism differ in their explanation of state behaviour?\n(A) Defensive realists place greater emphasis on the role of international institutions (B) Defensive realists place less emphasis on geographical factors (C) Offensive realists give more priority to the national interest than Defensive realists. (D) Defensive realists believe states are security maximizers, while Offensive realists believe states to be power maximizers\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. While defensive realism advocates that states are security maximizers, offensive realists think of states as power maximizers. The answer is (D).\n\nQ: The realm of policy decisions concerned primarily with relations between the United States and the rest of the world is known as\n(A) terrorism policy. (B) economic policy. (C) foreign policy. (D) international policy.\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help.
The realm of policy decisions concerned with relations between the US and the rest of the world is known as foreign policy. The answer is (C).\n\nQ: How did the 2008 financial crisis affect America's international reputation?\n(A) It damaged support for the US model of political economy and capitalism (B) It created anger at the United States for exaggerating the crisis (C) It increased support for American global leadership under President Obama (D) It reduced global use of the US dollar\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. The 2008 financial crisis damaged the international reputation of the American model of political economy and capitalism. The answer is (A).", "virology": "The following are multiple choice questions (with answers) about virology.\n\nQ: The median survival time to AIDS and death was established by following:\n(A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters (D) High-risk seronegatives\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The median survival time to AIDS and death was established by following seroconverters. The answer is (C).\n\nQ: Which of the following is a morphological characteristic of the paramyxoviruses.\n(A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate viruses (C) Icosahedral viruses with envelope (D) Very large viruses\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. Paramyxoviruses are fragile viruses often visualised with RNA spewing from the inside. The answer is (A).\n\nQ: The most important goal of a behavioral intervention is:\n(A) Change in behavior (B) Comprehensive coverage (C) Effective use of behavioral theory (D) Sustained behavior change\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The primary goal of a behavioral intervention is to cause sustained behavior change. The answer is (D).\n\nQ: A key factor facilitating the application of nested case-control studies from the MACS was:\n(A) Data collection (B) Establishment of a repository of biologic specimens (C) Participant interest (D) Administration of the questionnaire by staff\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The Multicenter AIDS Cohort Study's use of nested case-control studies was facilitated by the establishment of a repository of biologic specimens. The answer is (B).\n\nQ: Why are parvoviruses a highly impactful parasite?\n(A) Because they have no nucleic acid (B) They require a helper virus (C) Only replicate in dividing cells (D) Can integrate into host chromosomes\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. Parvoviruses are highly impactful because they do not have nucleic acid. The answer is (A).", "world_religions": "The following are multiple choice questions (with answers) about world religions.\n\nQ: How can the Upanishads be characterized?\n(A) Ritual texts (B) Philosophical texts (C) Hymns (D) Origin stories\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The Upanishads are the most recent part of the Vedas (the oldest scriptures in Hinduism) and supplied the basis of later Hindu philosophy. So they are philosophical texts. The answer is (B).\n\nQ: What is the Second Gem in Buddhism?\n(A) The Dharma (B) The Sangha (C) The Buddha (D) The Bodhisattva\nA: Let's think step by step.
We refer to Wikipedia articles on world religions for help. The Second Gem in Buddhism is The Dharma. The answer is (A).\n\nQ: Which Japanese government promoted a kind of national cult based on the emperor and his associations with kami?\n(A) Honen (B) Tanaka (C) Tokugawa (D) Meiji\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The promotion of a national cult based on the emperor and his associations with Kami happened during the reign of Emperor Meiji (1852-1912). The answer is (D).\n\nQ: In which dynasty was the \"Mandate of Heaven\" developed to legitimatize the new rulers?\n(A) Shang (B) Zhou (C) Han (D) Xia\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The \"Mandate of Heaven\" was developed as an ancient Chinese philosophical concept during the Zhou Dynasty (1046-256 BCE). The answer is (B).\n\nQ: What is the sign of the covenant for Jewish males?\n(A) The rainbow (B) Circumcision (C) A son (D) Bar mitzvah\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. In Judaism, the most distinctive sign of the covenant is circumcision (brit milah). The answer is (B)."} \ No newline at end of file diff --git a/lm_eval/tasks/mmlu/gen_all_splits.py b/lm_eval/tasks/mmlu/_generate_configs.py similarity index 76% rename from lm_eval/tasks/mmlu/gen_all_splits.py rename to lm_eval/tasks/mmlu/_generate_configs.py index f6f7d96e..db0e20cd 100644 --- a/lm_eval/tasks/mmlu/gen_all_splits.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -11,7 +11,7 @@ from lm_eval import utils from lm_eval.logger import eval_logger SUBJECTS = [ -    # "abstract_algebra", +    "abstract_algebra", "anatomy", "astronomy", "business_ethics", @@ -73,11 +73,14 @@ SUBJECTS = [ def parse_args(): parser = argparse.ArgumentParser() -    # parser.add_argument("--benchmark_name", required=True) parser.add_argument("--base_yaml_path", required=True) parser.add_argument( -        "--task_save_path", default="lm_eval/tasks/mmlu/hendrycks_test_original" +        "--save_prefix_path", default="flan" ) +    parser.add_argument( +        "--cot_prompt_path", default=None +    ) +    parser.add_argument("--task_prefix", default="") return parser.parse_args() @@ -91,16 +94,25 @@ if __name__ == "__main__": base_yaml = yaml.full_load(f) print(base_yaml) +    if args.cot_prompt_path is not None: +        import json +        with open(args.cot_prompt_path) as f: +            cot_file = json.load(f) + for subject in tqdm(SUBJECTS): +        if args.cot_prompt_path is not None: +            description = cot_file[subject] +        else: +            description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n" yaml_dict = { "include": base_yaml_name, -            "task": base_yaml["task"].strip("abstract_algebra") + "subject", +            "task": f"mmlu_{args.task_prefix}_{subject}" if args.task_prefix != "" else f"mmlu_{subject}", "dataset_name": subject, -            "description": f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n", +            "description": description, } -        file_save_path = args.task_save_path + f"_{subject}.yaml" +        file_save_path = args.save_prefix_path + f"_{subject}.yaml" eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") with open(file_save_path, "w") as yaml_file: yaml.dump(yaml_dict, yaml_file) diff --git a/lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml b/lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml similarity index 100% rename from lm_eval/tasks/mmlu/hendrycks_test_original_default.yaml rename to lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml
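For reference, one plausible way to drive the renamed script above and regenerate the per-subject CoT YAMLs is sketched below; the CoT-prompt JSON filename ("mmlu-cot.json") and the output prefix are illustrative assumptions, not paths fixed by this patch.

# Hypothetical driver for _generate_configs.py; "mmlu-cot.json" is an assumed
# name for the subject -> fewshot-prompt JSON added in this patch series.
import subprocess

subprocess.run(
    [
        "python",
        "lm_eval/tasks/mmlu/_generate_configs.py",
        "--base_yaml_path",
        "lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml",
        "--cot_prompt_path",
        "mmlu-cot.json",
        "--task_prefix",
        "flan_cot_fewshot",
        "--save_prefix_path",
        "lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu",
    ],
    check=True,
)

With --cot_prompt_path set, each generated YAML's description is taken from the JSON; otherwise the plain "The following are multiple choice questions (with answers) about ..." header is used, and files land at save_prefix_path + "_{subject}.yaml" with task names of the form mmlu_flan_cot_fewshot_{subject}.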
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml new file mode 100644 index 00000000..64587d54 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -0,0 +1,25 @@ +group: mmlu_flan_cot_fewshot +dataset_path: cais/mmlu +validation_split: validation +fewshot_split: dev +doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." +fewshot_delimiter: "" +output_type: greedy_until +doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +generation_kwargs: + until: + - "</s>" + do_sample: false + temperature: 0.0 +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "(?<=The answer is )(.*)(.)" + - function: "take_first" \ No newline at end of file
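As a sanity check on the "get-answer" filter in the template above, the regex can be exercised outside the harness; the completion string below is made up, and plain re.search merely approximates the harness's regex filter.

import re

# Fabricated model output in the style the fewshot prompts elicit.
completion = (
    "Let's think step by step. Short-period comets lie in the plane of the "
    "solar system, just like the Kuiper belt. The answer is (A).\n\nQ:"
)

match = re.search(r"(?<=The answer is )(.*)(.)", completion)
if match:
    # The greedy (.*) plus the trailing (.) leaves the final period in group 2,
    # so group 1 is "(A)", matching doc_to_target's "(A)"-style labels.
    print(match.group(1))  # -> (A)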
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml new file mode 100644 index 00000000..5c549591 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml @@ -0,0 +1,40 @@ +dataset_name: abstract_algebra +description: "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\nQ: Statement 1 | Every element of a group generates a cyclic subgroup\ + \ of the group. Statement 2 | The symmetric group S_10 has 10 elements.\n(A) True,\ + \ True (B) False, False (C) True, False (D) False, True\nA: Let's think step by\ + \ step. A cyclic group is a group that is generated by a single element. Hence a\ + \ subgroup generated by a single element of a group is cyclic and Statement 1 is\ + \ True. The symmetric group $S_n$ has $n!$ elements, hence it is not true that\ + \ $S_{10}$ has 10 elements, so Statement 2 is False. The answer is (C).\n\nQ:\ + \ Find the characteristic of the ring 2Z.\n(A) 0 (B) 3 (C) 12 (D) 30\nA: Let's think\ + \ step by step. The characteristic of a ring $R$ is $n$ if the statement $ka = 0$\ + \ for all $a\\in 2Z$ implies that $k$ is a multiple of $n$. Assume that $ka = 0$\ + \ for all $a\\in 2Z$ for some $k$. In particular $2k = 0$. Hence $k=0$ and $n=0$.\ + \ The answer is (A).\n\nQ: Statement 1| Every function from a finite set onto itself\ + \ must be one to one. Statement 2 | Every subgroup of an abelian group is abelian.\n\ + (A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think\ + \ step by step. Statement 1 is true. Let $S$ be a finite set. If $f:S \\rightarrow\ + \ S$ is an onto function, then $|S| = |f(S)|$. If $f$ was not one to one, then for\ + \ finite domain $S$ the image would have fewer than $|S|$ elements, a contradiction.\n\ + Statement 2 is true. Let $G$ be an abelian group and $H$ be a subgroup of $G$. We\ + \ need to show that $H$ is abelian. Let $a,b \\in H$. Then $a,b \\in G$ and $ab=ba$.\ + \ Since $G$ is abelian, $ab=ba$. Since $H$ is a subgroup of $G$, $ab \\in H$. Therefore,\ + \ $ab=ba$ and $H$ is abelian. The answer is (A).\n\nQ: Statement 1 | If aH is an element\ + \ of a factor group, then |aH| divides |a|. Statement 2 | If H and K are subgroups\ + \ of G then HK is a subgroup of G.\n(A) True, True (B) False, False (C) True, False\ + \ (D) False, True\nA: Let's think step by step. Statement 2 is false. Let $H$ be\ + \ a subgroup of $S_3$ generated by the cycle $(1,2)$ and $K$ be a subgroup of $S_3$\ + \ generated by the cycle $(1,3)$. Both $H$ and $K$ have two elements, the generators\ + \ and the identity. However $HK$ contains cycles (1,2), (1,3) and (2,3,1), but the\ + \ inverse of (2,3,1) is (2,1,3) and it does not belong to HK, hence HK is not a\ + \ subgroup. The answer is (B).\n\nQ: Find all c in Z_3 such that Z_3[x]/(x^2 + c)\ + \ is a field.\n(A) 0 (B) 1 (C) 2 (D) 3\nA: Let's think step by step. Z_3[x]/(x^2\ + \ + c) is a field if and only if x^2 + c does not have roots in Z_3. That is x^2\ + \ + c != 0 for every x in Z_3. If c = 0, then x^2 + c = x^2 has root 0. If c = 1\ + \ then x^2 + c = x^2 + 1 = 0 + 1 for x = 0, 1 + 1 = 2 for x = 1 and 1 + 1 = 2 for\ + \ x = 2, hence x^2 + 1 does not have any roots. For c = 2 the polynomial x^2 + 2\ + \ has two roots at x = 1 and x = 2. Hence Z_3[x]/(x^2 + c) is a field if and only\ + \ if c = 1. The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_abstract_algebra
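To make the include mechanism concrete: each per-subject file like the one above only overrides dataset_name, description, and task, while prompt rendering comes from the shared template. Roughly, doc_to_text and doc_to_target behave as in the sketch below; the row is a fabricated cais/mmlu-style example, and str.format stands in for the template engine.

# Fabricated cais/mmlu-style row; the real dataset supplies these fields.
doc = {
    "question": "Find the characteristic of the ring 2Z.",
    "choices": ["0", "3", "12", "30"],
    "answer": 0,  # index of the correct choice
}

template = (
    "\n\nQ: {q}\n(A) {c[0]} (B) {c[1]} (C) {c[2]} (D) {c[3]}"
    "\nA: Let's think step by step."
)
prompt = template.format(q=doc["question"].strip(), c=doc["choices"])
target = ["(A)", "(B)", "(C)", "(D)"][doc["answer"]]  # -> "(A)"
print(prompt)
print(target)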
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml new file mode 100644 index 00000000..28ca1c4c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml @@ -0,0 +1,57 @@ +dataset_name: anatomy +description: "The following are multiple choice questions (with answers) about anatomy.\n\ + \nQ: Which of the following is the body cavity that contains the pituitary gland?\n\ + (A) Abdominal (B) Cranial (C) Pleural (D) Spinal\nA: Let's think step by step. We\ + \ refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem\ + \ step by step. The pituitary gland is the major endocrine gland attached to the\ + \ base of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\ + \nQ: Which of these branches of the trigeminal nerve contain somatic motor processes?\n\ + (A) The supraorbital nerve (B) The infraorbital nerve (C) The mental nerve (D) None\ + \ of the above\nA: Let's think step by step. We refer to Wikipedia articles on anatomy\ + \ for help. Let\u2019s solve this problem step by step. \nWe know the following:\ + \ (A) The supraorbital nerve (also known as the frontal nerve) is the largest branch\ + \ of the ophthalmic nerve and branch of ophthalmic division of the trigeminal nerve.\ + \ (B) The infraorbital nerve is a branch of the maxillary division of the trigeminal\ + \ nerve. (C) The mental nerve is a branch of the mandibular division of the trigeminal\ + \ nerve. Because all these nerves are purely sensory nerves, they do not contain any\ + \ somatic motor processes. Therefore, the answer should be none of the above, which\ + \ is (D). The answer is (D).\n\nQ: In Angle's Class II Div 2 occlusion there is\n\ + (A) excess overbite of the upper lateral incisors. (B) negative overjet of the upper\ + \ central incisors. (C) excess overjet of the upper lateral incisors. (D) excess\ + \ overjet of the upper central incisors.\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step\ + \ by step. This is a question related to anatomy and orthodontics. Excess overjet\ + \ is associated with Class II occlusions; therefore, we can safely eliminate (B)\ + \ from the list, as negative overjet is often associated with Class III occlusions.\ + \ Now, we need to determine the location of the excess overjet, and that would be\ + \ the upper (maxillary) lateral incisors. Only (C) has the correct information.\ + \ The answer is (C).\n\nQ: The pleura\n(A) have no sensory innervation. (B) are\ + \ separated by a 2 mm space. (C) extend into the neck. (D) are composed of respiratory\ + \ epithelium.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy\ + \ for help. Let\u2019s solve this problem step by step. First, recall that the pleura\ + \ refers to the thin layer of tissue that covers the lungs and lines the interior\ + \ wall of the chest cavity. Now, let\u2019s look at each option:\nOption (A): \u201C\ + The pleura have no sensory innervation.\u201D This information is not correct. The\ + \ pleura do have a sensory innervation.\nOption (B): \u201CThe pleura are separated\ + \ by a 2 mm space.\u201D This information is not correct. There is a very thin \u201C\ + potential\u201D space between the layers of the pleura; however, it is typically\ + \ filled with serous pleural fluid. \nOption (C): \u201CThe pleura extend into the\ + \ neck.\u201D This information is actually true. The cervical pleura, also known\ + \ as the dome of the pleura, lines the extension of the pleural cavity into the\ + \ neck.\nOption (D): \u201CThe pleura are composed of respiratory epithelium.\u201D\ + \ This information is not correct. The pleura are composed of connective tissue\ + \ (CT).\nBecause (A), (B), and (D) are all incorrect, (C) is the only correct answer.\ + \ The answer is (C).\n\nQ: What is the embryological origin of the hyoid bone?\n\ + (A) The first pharyngeal arch (B) The first and second pharyngeal arches (C) The\ + \ second pharyngeal arch (D) The second and third pharyngeal arches\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve\ + \ this problem step by step. The hyoid bone is a small U-shaped bone located in\ + \ the anterior neck. In its resting position, it lies between the base of the mandible\ + \ and the third cervical vertebra. We know that the second and the third pharyngeal\ + \ arches give rise to the horns of the hyoid bone; therefore, the embryological\ + \ origin of the hyoid bone are the second and the third pharyngeal arches\u2014\ + this information is covered in the last option (D). Therefore, we conclude that\ + \ (D) must be the correct answer. The answer is (D)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_anatomy diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml new file mode 100644 index 00000000..cd50fd55 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml @@ -0,0 +1,54 @@ +dataset_name: astronomy +description: "The following are multiple choice questions (with answers) about astronomy.\n\ + \nQ: Where do most short-period comets come from and how do we know?\n(A) The Kuiper\ + \ belt; short period comets tend to be in the plane of the solar system just like\ + \ the Kuiper belt.
(B) The Kuiper belt; short period comets tend to come from random\ + \ directions indicating a spherical distribution of comets called the Kuiper belt.\ + \ (C) The asteroid belt; short period comets have orbital periods similar to asteroids\ + \ like Vesta and are found in the plane of the solar system just like the asteroid\ + \ belt. (D) The Oort cloud; short period comets tend to be in the plane of the solar\ + \ system just like the Oort cloud.\nA: Let's think step by step. Most short-period\ + \ comets come from the Kuiper belt, and we know because short period comets tend\ + \ to be in the plane of the solar system, just like the Kuiper belt is. The answer\ + \ is (A).\n\nQ: You are pushing a truck along a road. Would it be easier to accelerate\ + \ this truck on Mars? Why? (Assume there is no friction)\n(A) It would be harder\ + \ since the truck is heavier on Mars. (B) It would be easier since the truck is\ + \ lighter on Mars. (C) It would be harder since the truck is lighter on Mars. (D)\ + \ It would be the same no matter where you are.\nA: Let's think step by step. If\ + \ we assume that there is no friction, the force needed to accelerate the truck\ + \ is by Newton\u2019s second law only dependent on the mass of the truck. Hence\ + \ (A), (B) and (C) are incorrect since it doesn\u2019t matter that it\u2019s on\ + \ Mars, and (D) is the correct answer. The answer is (D).\n\nQ: Say the pupil of\ + \ your eye has a diameter of 5 mm and you have a telescope with an aperture of 50\ + \ cm. How much more light can the telescope gather than your eye?\n(A) 10000 times\ + \ more (B) 100 times more (C) 1000 times more (D) 10 times more\nA: Let's think\ + \ step by step. The amount of light is proportional to the aperture area $A = \\\ + pi D^2/4$ for a lens with diameter $D$, so the relative amounts of light between\ + \ the eye with diameter 5mm and the telescope with diameter 50 cm is $(50 cm)^2/(5mm)^2\ + \ = 10000$. The answer is (A).\n\nQ: Why isn't there a planet where the asteroid\ + \ belt is located?\n(A) A planet once formed here but it was broken apart by a catastrophic\ + \ collision. (B) There was not enough material in this part of the solar nebula\ + \ to form a planet. (C) There was too much rocky material to form a terrestrial\ + \ planet but not enough gaseous material to form a jovian planet. (D) Resonance\ + \ with Jupiter prevented material from collecting together to form a planet.\nA:\ + \ Let's think step by step. The asteroid belt is a stellar disc consisting of a\ + \ large number of asteroids between Mars and Jupiter's orbits. The asteroids in\ + \ this belt are affected by the gravitational pull from both other asteroids and\ + \ nearby planets. Due to the strong gravitational force of Jupiter there are resonances\ + \ that give rise to low density regions of asteroids known as the Kirkwood gap.\ + \ So (B) and (C) are not correct since it\u2019s not a lack of material that prevents\ + \ a planet from being formed, and (A) is incorrect because the Kirkwood gap would\ + \ have prevented a planet from forming in the first place, and (D) is the correct\ + \ option. The answer is (D).\n\nQ: Why is Mars red?\n(A) Because the surface is\ + \ covered with heavily oxidized (\"rusted\") minerals. (B) Because the atmosphere\ + \ scatters more light at bluer wavelengths transmitting mostly red light. (C) Because\ + \ Mars is covered with ancient lava flows which are red in color.
(D) Because flowing\ + \ water on Mars's surface altered the surface minerals several billion years ago.\n\ + A: Let's think step by step. Option (B) is not correct because if the red color\ + \ was caused by the scattering off the atmosphere, then the earth with a much thicker\ + \ atmosphere would also look red. Options (C) and (D) are not specific enough about\ + \ why the color of the surface would be red, while (A) is correct because it explains\ + \ that the surface is red due to the rusted materials on the surface and the red\ + \ color comes from the rust. So the correct option is (A). The answer is (A)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_astronomy diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml new file mode 100644 index 00000000..60d939a8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml @@ -0,0 +1,55 @@ +dataset_name: business_ethics +description: "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\nQ: In contrast to _______, _______ aim to reward favourable behaviour\ + \ by companies. The success of such campaigns have been heightened through the use\ + \ of ___________, which allow campaigns to facilitate the company in achieving _________\ + \ .\n(A) Buycotts, Boycotts, Blockchain technology, Charitable donations (B) Buycotts,\ + \ Boycotts, Digital technology, Increased Sales (C) Boycotts, Buyalls, Blockchain\ + \ technology, Charitable donations (D) Boycotts, Buycotts, Digital technology, Increased\ + \ Sales\nA: Let's think step by step. We refer to Wikipedia articles on business\ + \ ethics for help. The sentence that best uses the possible options above is \u201C\ + In contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies.\ + \ The success of such campaigns have been heightened through the use of *digital\ + \ technology*, which allow campaigns to facilitate the company in achieving *increased\ + \ sales*.\u201D The answer is (D).\n\nQ: _______ is the direct attempt to formally\ + \ or informally manage ethical issues or problems, through specific policies, practices\ + \ and programmes.\n(A) Corporate social responsibility (B) Business ethics management\ + \ (C) Sustainability (D) Environmental management\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on business ethics for help. The direct attempt\ + \ to manage ethical issues through specific policies, practices, and programs is\ + \ business ethics management. The answer is (B).\n\nQ: Three contrasting tactics\ + \ that CSO's can engage in to meet their aims are ________ which typically involves\ + \ research and communication, ________, which may involve physically attacking a\ + \ company's operations or ________, often involving some form of _______.\n(A) Non-violent\ + \ direct action, Violent direct action, Indirect action, Boycott (B) Indirect action,\ + \ Instrumental action, Non-violent direct action, Information campaign (C) Indirect\ + \ action, Violent direct action, Non-violent direct-action Boycott (D) Non-violent\ + \ direct action, Instrumental action, Indirect action, Information campaign\nA:\ + \ Let's think step by step. We refer to Wikipedia articles on business ethics for\ + \ help.
The sentence that best uses the possible options above is \u201CThree contrasting\
+ \ tactics that CSO's can engage in to meet their aims are *indirect action*, which\
+ \ typically involves research and communication, *violent direct action*, which\
+ \ may involve physically attacking a company's operations or *non-violent direct\
+ \ action*, often involving some form of *boycott*.\u201D The answer is (C).\n\n\
+ Q: To ensure the independence of the non-executive board members, there are a number\
+ \ of steps which can be taken, which include non-executives being drawn from _______\
+ \ the company, being appointed for a _________ time period as well as being appointed\
+ \ _________.\n(A) Outside, Limited, Independently (B) Inside, Limited, Intermittently\
+ \ (C) Outside, Unlimited, Intermittently (D) Inside, Unlimited, Independently\n\
+ A: Let's think step by step. We refer to Wikipedia articles on business ethics for\
+ \ help. The sentence that best uses the possible options above is \u201CTo ensure\
+ \ the independence of the non-executive board members, there are a number of steps\
+ \ which can be taken, which include non-executives being drawn from *outside* the\
+ \ company, being appointed for a *limited* time period as well as being appointed\
+ \ *independently*.\u201D The answer is (A).\n\nQ: Beyond the business case for engaging\
+ \ in CSR there are a number of moral arguments relating to: negative _______, the\
+ \ _______that corporations possess and the ________ of business and society.\n(A)\
+ \ Externalities, Power, Independence (B) Publicity, Insubstantial resources, Mutual\
+ \ dependence (C) Publicity, Power, Independence (D) Externalities, Power, Mutual\
+ \ dependence\nA: Let's think step by step. We refer to Wikipedia articles on business\
+ \ ethics for help. The sentence that best uses the possible options above is \u201C\
+ Beyond the business case for engaging in CSR there are a number of moral arguments\
+ \ relating to: negative *externalities*, the *power* that corporations possess and\
+ \ the *mutual dependence* of business and society.\u201D The answer is (D)."
The difference between a male and female catheter is that male catheters + tend to be longer than female catheters. The answer is (B). + + + Q: How many attempts should you make to cannulate a patient before passing the job + on to a senior colleague, according to the medical knowledge of 2020? + + (A) 4 (B) 3 (C) 2 (D) 1 + + A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge + for help. According to the medical protocol as of 2020, you should make two attempts + to cannulate a patient before passing the job on to a more-senior practitioner. + The answer is (C). + + + Q: In the assessment of the hand function which of the following is true? + + (A) Abduction of the thumb is supplied by spinal root T2 (B) Opposition of the thumb + by opponens policis is supplied by spinal root T1 (C) Finger adduction is supplied + by the median nerve (D) Finger abduction is mediated by the palmar interossei + + A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge + for help. Of all the options, it is only true that the opposition of the thumb by + opponens pollicis is supplied by spinal root T1. The answer is (B). + + + Q: The energy for all forms of muscle contraction is provided by: + + (A) ATP. (B) ADP. (C) phosphocreatine. (D) oxidative phosphorylation. + + A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge + for help. The energy for muscular contraction is provided by ATP (adenosine triphosphate), + which is the powerhouse of the cell. The answer is (A).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_clinical_knowledge diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml new file mode 100644 index 00000000..be51794a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml @@ -0,0 +1,60 @@ +dataset_name: college_biology +description: "The following are multiple choice questions (with answers) about college\ + \ biology.\n\nQ: Which of the following represents an accurate statement concerning\ + \ arthropods?\n(A) They possess an exoskeleton composed primarily of peptidoglycan.\ + \ (B) They possess an open circulatory system with a dorsal heart. (C) They are\ + \ members of a biologically unsuccessful phylum incapable of exploiting diverse\ + \ habitats and nutrition sources. (D) They lack paired, jointed appendages.\nA:\ + \ Let's think step by step. Peptidoglycan is known to comprise the plasma membrane\ + \ of most bacteria, rather than the exoskeleton of arthropods, which is made of\ + \ chitin, which rules out (A). The answer (C) is false because arthropods are a\ + \ highly successful phylum. Likewise, arthropods have paired, jointed appendages,\ + \ which rules out (D). The only remaining option is (B), as arthropods have an open\ + \ circulatory system with a dorsal tubular heart. The answer is (B).\n\nQ: In a\ + \ given population, 1 out of every 400 people has a cancer caused by a completely\ + \ recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium,\ + \ which of the following is the expected proportion of individuals who carry the\ + \ b allele but are not expected to develop the cancer?\n(A) 1/400 (B) 19/400 (C)\ + \ 20/400 (D) 38/400\nA: Let's think step by step. 
According to the Hardy-Weinberg\
+ \ law, $p^2 + 2 p q + q^2 = 1$, and $p + q = 1$ where $p$ is the frequency of the\
+ \ dominant allele, $q$ is the frequency of the recessive allele, and $p^2$, $q^2$,\
+ \ and $2pq$ are the frequencies of dominant homozygous, recessive homozygous, and\
+ \ heterozygous individuals, respectively. The frequency of the recessive allele\
+ \ (q) is $\\sqrt{\\frac{1}{400}} = 0.05$. We have $p = 1 - q = 0.95$. The frequency\
+ \ of heterozygous individuals is $2pq = 2 \\cdot 0.05 \\cdot 0.95 = 0.095$. The\
+ \ number of heterozygous individuals is equal to the frequency of heterozygous individuals\
+ \ times the size of the population, or $0.095 * 400 = 38$. So we end up with 38/400.\
+ \ The answer is (D).\n\nQ: According to the pressure-flow model of movement of phloem\
+ \ contents, photosynthate movement from source to sink is driven by\n(A) an ATP-dependent\
+ \ pressure-flow pump (B) a water-pressure potential gradient (C) transpiration (D)\
+ \ apoplastic diffusion\nA: Let's think step by step. It is a gradient in water pressure\
+ \ that induces the movement of phloem content, which refers to answer (B). The mechanism\
+ \ of movement does not rely on metabolism, which rules out (A). Transpiration refers\
+ \ to the exhalation of water vapor through plant stomata, and is also not related,\
+ \ which rules out (C). While the apoplastic pathway is one of two main pathways\
+ \ for water transport in plants, it is not central to the pressure flow model, which\
+ \ rules out (D). The answer is (B).\n\nQ: Which of the following contain DNA sequences\
+ \ required for the segregation of chromosomes in mitosis and meiosis?\n(A) Telomeres\
+ \ (B) Centromeres (C) Nucleosomes (D) Spliceosomes\nA: Let's think step by step.\
+ \ The genetic material in Telomeres is not used, which rules out (A). Nucleosomes\
+ \ are the repeating subunit that comprises chromatin packed in a cell nucleus, and\
+ \ do not specifically refer to DNA sequences necessary for segregating chromosomes\
+ \ in cell division, which rules out (C). A spliceosome is a large ribonucleoprotein\
+ \ that removes introns from transcribed pre-mRNA rather than governing chromosome\
+ \ segregation. Centromeres are directly responsible for segregating chromosomes\
+ \ in cell division. The answer is (B).\n\nQ: The presence of homologous structures\
+ \ in two different organisms, such as the humerus in the front limb of a human and\
+ \ a bird, indicates that\n(A) the human and bird are polyphyletic species (B) a\
+ \ human's and bird's evolution is convergent (C) the human and bird belong to a\
+ \ clade (D) the human and bird developed by analogy\nA: Let's think step by step.\
+ \ Polyphyletic species are organisms that are grouped due to having similar characteristics\
+ \ but which do not have a common ancestor. This is not the case for humans and birds,\
+ \ which rules out (A). Convergent evolution refers to the independent development\
+ \ of similar features in different species at different periods, which is also not\
+ \ the case for humans and birds, which rules out (B). Analogy refers to the superficial\
+ \ resemblance of structures that have different origins, which is not the case for\
+ \ the human and bird forearms, which rules out (D). Humans and birds do belong to\
+ \ the same clade - a group of organisms composed of a common ancestor. The answer\
+ \ is (C)."
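
A quick sanity check of the Hardy-Weinberg arithmetic in the exemplar above. This is a
standalone sketch, separate from the generated YAML, and the variable names are ours:

import math

# 1/400 affected (q^2) should imply 38/400 carriers (2pq) in a population of 400
q = math.sqrt(1 / 400)        # recessive allele frequency, 0.05
p = 1 - q                     # dominant allele frequency, 0.95
carriers = 2 * p * q * 400    # expected number of heterozygotes
assert round(carriers) == 38  # matches the 38/400 in answer (D)
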
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_college_biology
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
new file mode 100644
index 00000000..a02c909e
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
@@ -0,0 +1,37 @@
+dataset_name: college_chemistry
+description: "The following are multiple choice questions (with answers) about college\
+ \ chemistry.\n\nQ: 3 Cl\u2212(aq) + 4 CrO_4^2\u2212(aq) + 23 H+(aq) \u2192 3 HClO2(aq)\
+ \ + 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl\u2212(aq) behaves as\n\
+ (A) an acid (B) a base (C) a catalyst (D) a reducing agent\nA: Let's think step\
+ \ by step. A molecule that behaves as a base accepts an H+ ion (or proton) from\
+ \ another molecule, whereas a molecule that behaves as an acid donates an H+ ion\
+ \ (or proton) to another molecule. Neither of these is the case for Cl in this reaction,\
+ \ which rules out (A) and (B). A catalyst is a substance that only accelerates a\
+ \ reaction without itself undergoing chemical change, which is not the case here.\
+ \ This rules out (C). Instead, the $Cl^{-}$ ions carry a negative charge, which\
+ \ they donate in the reaction to form 3 HClO2. This is the behavior of a reducing\
+ \ agent, or (D). The answer is (D).\n\nQ: Which of the following statements about\
+ \ the lanthanide elements is NOT true?\n(A) The most common oxidation state for\
+ \ the lanthanide elements is +3. (B) Lanthanide complexes often have high coordination\
+ \ numbers (> 6). (C) All of the lanthanide elements react with aqueous acid to liberate\
+ \ hydrogen. (D) The atomic radii of the lanthanide elements increase across the\
+ \ period from La to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide\
+ \ elements in fact decrease across the period from La to Lu. Options (A), (B), and\
+ \ (C) are all true. This means that only (D) is NOT true. The answer is (D).\n\n\
+ Q: Which of the following lists the hydrides of group-14 elements in order of thermal\
+ \ stability, from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4\
+ \ < SnH4 < CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4\
+ \ < GeH4 < SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14\
+ \ hydrides decreases as we move from the top of group 14 to the bottom. The order\
+ \ of elements in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in\
+ \ order of increasing thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4,\
+ \ or answer (A). The answer is (A).\n\nQ: Predict the number of lines in the EPR\
+ \ spectrum of a solution of 13C-labelled methyl radical (13CH3\u2022), assuming\
+ \ the lines do not overlap.\n(A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step\
+ \ by step. The electron paramagnetic resonance spectrum will be split by two forms\
+ \ of interactions. The first is the hyperfine interaction with the 13C (nuclear\
+ \ spin $I = \\frac{1}{2}$) which will split the spectrum into 2 lines. This will\
+ \ be further split into 4 lines by the interaction with three equivalent 1H nuclei.\
+ \ The total number of lines is therefore $2 \\cdot 4 = 8$. The answer is (E)."
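
The EPR line count in the chemistry exemplar above follows the standard 2nI + 1
multiplicity rule; a minimal check, with an illustrative helper name of our own:

def epr_lines(n_nuclei, spin):
    # n equivalent nuclei of spin I split a signal into 2*n*I + 1 lines
    return int(2 * n_nuclei * spin + 1)

# one 13C (I = 1/2) gives 2 lines; three equivalent 1H (I = 1/2) give 4 lines
assert epr_lines(1, 0.5) * epr_lines(3, 0.5) == 8  # answer (E)
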
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_college_chemistry
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml
new file mode 100644
index 00000000..20b398c1
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml
@@ -0,0 +1,189 @@
+dataset_name: college_computer_science
+description: 'The following are multiple choice questions (with answers) about college
+ computer science.
+
+
+ Q: Which of the following regular expressions is equivalent to (describes the same
+ set of strings as) (a* + b)*(c + d)?
+ (A) a*(c + d)+ b(c + d)
+ (B) a*(c + d)* + b(c + d)*
+ (C) a*(c + d)+ b*(c + d)
+ (D) (a + b)*c + (a + b)*d
+ A: Let''s think step by step. We know that:
+ 1. (X* + Y)* = (X + Y)*
+ 2. X(Y + Z)? = XY + XZ
+ Using equation 1 we can rewrite (a* + b)*(c + d)? as:
+ 3. (a + b)*(c + d)?
+ Using equation 2 we can rewrite equation 3 as:
+ (a + b)*c + (a + b)*d The answer is (D).
+
+
+ Q: The Singleton design pattern is used to guarantee that only a single instance
+ of a class may be instantiated. Which of the following is (are) true of this design
+ pattern?
+ I. The Singleton class has a static factory method to provide its instance.
+ II. The Singleton class can be a subclass of another class.
+ III. The Singleton class has a private constructor.
+ (A) I only
+ (B) II only
+ (C) III only
+ (D) I, II, and III
+ A: Let''s think step by step. Statement I is a correct statement about a Singleton,
+ because a Singleton restricts instantiation to a single, static method. Statement
+ II is also correct, because there is no inherent restriction regarding the inheritance
+ of a Singleton. Statement III is also correct, because a Singleton must be instantiated
+ only once, so its constructor is made private to prevent any construction except
+ via its static factory method.
+ Given these facts, statements I, II, and III are all correct. The answer is (D).
+
+
+ Q: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, . .
+ . , R7 and supports the following operations:
+ ADD Rs1, Rs2, Rd (Add Rs1 to Rs2 and put the sum in Rd)
+ MUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the product in Rd)
+ An operation normally takes one cycle; however, an operation takes two cycles if
+ it produces a result required by the immediately following operation in an operation
+ sequence.
+ Consider the expression AB + ABC + BC, where variables A, B, C are located in registers
+ R0, R1, R2. If the contents of these three registers must not be modified, what
+ is the minimum number of clock cycles required for an operation sequence that computes
+ the value of AB + ABC + BC?
+ (A) 5 (B) 6 (C) 7 (D) 8
+ A: Let''s think step by step. First, we are given that A is in R0, B is in R1, and
+ C is in R2.
+ Next, we can see that we must compute three multiplies (AB, BC, and ABC) and two
+ adds (AB + ABC, (AB + ABC) + BC) to compute our final answer, resulting in a minimum
+ of five clock cycles.
+ Next, we can see that there is no way to avoid at least one pipeline stall when
+ computing our final answer, because to compute our final sum we must wait at least
+ one cycle for the results from the previous stage to be ready. Thus, our minimum
+ number of cycles must be 6. 
+ + We can verify that we can create a solution that requires only six cycles as follows: + + compute AB: MUL R0, R1, R3 + + compute BC: MUL R1, R2, R4 + + compute ABC: MUL R3, R4, R5 + + compute AB + BC: ADD R3, R4, R6 + + STALL + + compute AB + ABC + BC: ADD R5, R6, R7 + + So there are 6 cycles. The answer is (B). + + + Q: A compiler generates code for the following assignment statement. + + G := (A + B) * C - (D + E) * F + + The target machine has a single accumulator and a single-address instruction set + consisting of instructions load, store, add, subtract, and multiply. For the arithmetic + operations, the left operand is taken from the accumulator and the result appears + in the accumulator. The smallest possible number of instructions in the resulting + code is + + (A) 5 (B) 6 (C) 7 (D) 9 + + A: Let''s think step by step. We can compute the final answer with the following + sequence of operations: + + 1. LOAD D (accumulator = D) + + 2. ADD E (accumulator = D+E) + + 3. MUL F (accumulator = (D+E)*F) + + 4. STORE X (X = (D+E)*F) + + 5. LOAD A (accumulator = A) + + 6. ADD B (accumulator = A+B) + + 7. MUL C (accumulator = (A+B)*C) + + 8. SUB X (accumulator = (A+B)*C - (D+E)*F) + + 9. STORE G (G = (A+B)*C - (D+E)*F) + + This sequence takes 9 instructions. The answer is (D). + + + Q: Consider a computer design in which multiple processors, each with a private + cache memory, share global memory using a single bus. This bus is the critical system + resource. Each processor can execute one instruction every 500 nanoseconds as long + as memory references are satisfied by its local cache. When a cache miss occurs, + the processor is delayed for an additional 2,000 nanoseconds. During half of this + additional delay, the bus is dedicated to serving the cache miss. During the other + half, the processor cannot continue, but the bus is free to service requests from + other processors. On average, each instruction requires 2 memory references. On + average, cache misses occur on 1 percent of references. What proportion of the capacity + of the bus would a single processor consume, ignoring delays due to competition + from other processors? + + (A) 1/50 (B) 1/27 (C) 1/25 (D) 2/27 + + A: Let''s think step by step. We know that each instruction requires two memory + references per instruction, and that there is an average cache miss rate of one + percent. + + Thus a given processor has: + + (1 cache miss / 100 references) * (2 references / instruction) = + + (2 cache misses / 100 instructions), so: + + misses_per_instruction = 1 cache miss / 50 instructions. + + Next, we know that each instruction requires 500 nanoseconds when there is no cache + miss, and 500 + 2000 = 2500 nanoseconds when there is a cache miss. Thus: + + 50 instructions / (49 * 500) + (1 * 2500) nanoseconds, so: + + instructions_per_ns = 50 instructions / 27000 nanoseconds. + + Now, we know that each cache miss locks the bus for half of the 2000 nanosecond + cache miss delay, or 1000 nanoseconds, so: + + lock_ns_per_miss = 1000 nanoseconds / cache miss. + + Thus we can see that on average a single processor will lock the bus for: + + lock_ns_per_miss * misses_per_instruction * instructions_per_ns = + + (1000 nanoseconds / cache miss) * (1 cache miss / 50 instructions) * (50 instructions + / 27000 nanoseconds) = 1000 * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer + is (B).' 
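
The bus-utilization figure in the last computer-science exemplar can be re-derived
with exact fractions; a small sketch (the numbers come from the question, the
structure is ours):

from fractions import Fraction

misses_per_instr = 2 * Fraction(1, 100)       # 2 references/instruction, 1% miss rate
ns_per_instr = 500 + misses_per_instr * 2000  # 500 ns base plus 2000 ns per miss
bus_ns_per_instr = misses_per_instr * 1000    # bus is held for half of each miss delay
assert bus_ns_per_instr / ns_per_instr == Fraction(1, 27)  # answer (B)
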
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_college_computer_science
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml
new file mode 100644
index 00000000..4442f9ed
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml
@@ -0,0 +1,49 @@
+dataset_name: college_mathematics
+description: "The following are multiple choice questions (with answers) about college\
+ \ mathematics.\n\nQ: Let V be the set of all real polynomials p(x). Let transformations\
+ \ T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret\
+ \ (ST)(p(x)) as S(T(p(x))). Which of the following is true?\n(A) ST = 0 (B) ST =\
+ \ T (C) ST = TS (D) ST - TS is the identity map of V onto itself.\nA: Let's think\
+ \ step by step. For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))\u2019 = p(x)\
+ \ + xp\u2019(x)\\]\nand\n\\[TS(p) = xp\u2019(x).\\]\nHence \\[ST(p) - TS(p) = p(x)\
+ \ + xp\u2019(x) - xp\u2019(x) = p(x).\\] The answer is (D).\n\nQ: Suppose that f(1 + x)\
+ \ = f(x) for all real x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11\
+ \ (B) 0 (C) 11 (D) 33/2\nA: Let's think step by step. The only polynomial such that\
+ \ $f(1 + x) = f(x)$ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. The answer\
+ \ is (C).\n\nQ: Let A be a real 2x2 matrix. Which of the following statements must\
+ \ be true?\nI. All of the entries of A^2 are nonnegative.\nII. The determinant of\
+ \ A^2 is nonnegative.\nIII. If A has two distinct eigenvalues, then A^2 has two\
+ \ distinct eigenvalues.\n(A) I only (B) II only (C) III only (D) II and III only\n\
+ A: Let's think step by step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence\
+ \ II holds.\nIII is false: as a counterexample take a diagonal matrix with -1 and\
+ \ 1 on the diagonal. Then $A^2$ is the identity matrix. The answer is (B).\n\nQ:\
+ \ Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n =\
+ \ 22. What is the greatest negative number in the set B = {m + n : (m, n) \\in A}?\n\
+ (A) -5 (B) -4 (C) -3 (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m\
+ \ and one of the solutions is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need\
+ \ to look for smaller $m$ in order to make $m + n$ negative. The next solution is\
+ \ $m = -14$ and $n = 10$, with $m + n = -4$. For smaller $m$ we have $m + n$ smaller\
+ \ than $-4$. The answer is (B).\n\nQ: A tank initially contains a salt solution of 3 grams of salt\
+ \ dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt\
+ \ per liter of water is sprayed into the tank at a rate of 4 liters per minute.\
+ \ The sprayed solution is continually mixed with the salt solution in the tank,\
+ \ and the mixture flows out of the tank at a rate of 4 liters per minute. If the\
+ \ mixing is instantaneous, how many grams of salt are in the tank after 100 minutes\
+ \ have elapsed?\n(A) 2 (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step\
+ \ by step. For all $t \\in \\mathbb{R}$, let $s(t)$ denote the number of grams of salt\
+ \ in the tank at the $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably.\
+ \ We also use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. The solution sprayed\
+ \ into the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always\
+ \ 100 liters of liquid in the tank, containing $s$ grams of salt. 
So the density\
+ \ of salt in the tank is $s / 100$ grams per liter. The flow of water out of the\
+ \ tank therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for\
+ \ all $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$,\
+ \ and so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\\right]$. For all $t \\in\
+ \ \\mathbb{R}$,\n$$\n\\frac{d}{d t}[\\ln (s-2)]=\\frac{s^{\\prime}}{s-2}=\\frac{-1}{25}=\\\
+ frac{d}{d t}\\left[-\\frac{t}{25}\\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that,\
+ \ for all $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then,\
+ \ for all $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K\
+ \ e^{-t / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100\
+ \ / 25}=2+1 \\cdot e^{-4}=2+e^{-4}$. The answer is (D)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_college_mathematics
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml
new file mode 100644
index 00000000..8f3ae14e
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml
@@ -0,0 +1,52 @@
+dataset_name: college_medicine
+description: "The following are multiple choice questions (with answers) about college\
+ \ medicine.\n\nQ: An expected side effect of creatine supplementation is:\n(A) muscle\
+ \ weakness. (B) gain in body mass. (C) muscle cramps. (D) loss of electrolytes.\n\
+ A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\
+ \ Creatine is a dietary supplement whose use results in body mass gain.\
+ \ The answer is (B).\n\nQ: Which of the following is not a true statement?\n(A)\
+ \ Muscle glycogen is broken down enzymatically to glucose-1-phosphate (B) Elite\
+ \ endurance runners have a high proportion of Type I fibres in their leg muscles\
+ \ (C) Liver glycogen is important in the maintenance of the blood glucose concentration\
+ \ (D) Insulin promotes glucose uptake by all tissues in the body\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s\
+ \ solve this step by step and go over each choice: \n(A) \u201CMuscle glycogen is\
+ \ broken down enzymatically to glucose-1-phosphate\u201D: This is a correct statement.\n\
+ (B) \u201CElite endurance runners have a high proportion of Type I fibres in their\
+ \ leg muscles\u201D: This is a correct statement.\n(C) \u201CLiver glycogen is important\
+ \ in the maintenance of the blood glucose concentration\u201D: This is a correct\
+ \ statement. \n(D) \u201CInsulin promotes glucose uptake by all tissues in the body\u201D\
+ : This is not a correct statement, because insulin promotes glucose uptake by the\
+ \ liver, adipose tissue, and muscle, but not all tissues. For instance, the tissues\
+ \ in the brain and red blood cells are not affected by insulin. The answer is (D).\n\
+ \nQ: A high school science teacher fills a 1 liter bottle with pure nitrogen and\
+ \ seals the lid. The pressure is 1.70 atm, and the room temperature is 25\xB0C.\
+ \ Which two variables will both increase the pressure of the system, if all other\
+ \ variables are held constant?\n(A) Increasing temperature, increasing moles of\
+ \ gas (B) Increasing temperature, increasing volume (C) Decreasing volume, decreasing\
+ \ temperature (D) Decreasing moles of gas, increasing volume\nA: Let's think step\
+ \ by step. 
We refer to Wikipedia articles on medicine for help. The relevant equation\ + \ for this is the ideal gas law: PV=nRT. To increase the pressure of the system\ + \ (P), then either n (number of moles of the gas) or T (temperature) have to increase.\ + \ The answer is (A).\n\nQ: In a genetic test of a newborn, a rare genetic disorder\ + \ is found that has X-linked recessive transmission. Which of the following statements\ + \ is likely true regarding the pedigree of this disorder?\n(A) All descendants on\ + \ the maternal side will have the disorder. (B) Females will be approximately twice\ + \ as affected as males in this family. (C) All daughters of an affected male will\ + \ be affected. (D) There will be equal distribution of males and females affected.\n\ + A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\ + \ Let\u2019s solve this step by step. Let's recall first that females have two X\ + \ chromosomes, while males have one X and one Y chromosome. This is an important\ + \ fact we need to know before answering this question. \nBecause a male can only\ + \ pass his only one X chromosome to a daughter, if he is affected by this rare genetic\ + \ disorder, then we know for sure that he will pass this rare genetic disorder to\ + \ all his future-born daughters. Therefore, \u201C(C): All daughters of an affected\ + \ male will be affected\u201D is a correct statement. The answer is (C).\n\nQ: Glucose\ + \ is transported into the muscle cell:\n(A) via protein transporters called GLUT4.\ + \ (B) only in the presence of insulin. (C) via hexokinase. (D) via monocarbylic\ + \ acid transporters.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on medicine for help. Glucose (also known as the blood sugar) is the main sugar\ + \ found in the human body. It is transported into the muscle cell via diffusion\ + \ through protein transporters called GLUT4. The answer is (A)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_college_medicine diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml new file mode 100644 index 00000000..d500a5b8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml @@ -0,0 +1,70 @@ +dataset_name: college_physics +description: 'The following are multiple choice questions (with answers) about college + physics. + + + Q: A refracting telescope consists of two converging lenses separated by 100 cm. + The eye-piece lens has a focal length of 20 cm. The angular magnification of the + telescope is + + (A) 4 (B) 5 (C) 6 (D) 20 + + A: Let''s think step by step. In a refracting telescope, if both lenses are converging, + the focus of both lenses must be between the two lenses, and thus the focal lengths + of the two lenses must add up to their separation. Since the focal length of one + lens is 20 cm, the focal length of the other must be 80 cm. The magnification is + the ratio of these two focal lengths, or 4. The answer is (A). + + + Q: The muon decays with a characteristic lifetime of about 10^-6 second into an + electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from + decaying into an electron and just a single neutrino by the law of conservation + of + + (A) charge (B) mass (C) energy and momentum (D) lepton number + + A: Let''s think step by step. Lepton number must be conserved, meaning the total + number of leptons minus the number of antileptons. 
If a muon decays into an electron + and a single neutrino, the total lepton number would go from one to two, violating + lepton number conservation. The answer is (D). + + + Q: One end of a Nichrome wire of length 2L and cross-sectional area A is attached + to an end of another Nichrome wire of length L and cross- sectional area 2A. If + the free end of the longer wire is at an electric potential of 8.0 volts, and the + free end of the shorter wire is at an electric potential of 1.0 volt, the potential + at the junction of the two wires is most nearly equal to + + (A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V + + A: Let''s think step by step. This is a simple voltage divider problem, where the + longer wire has a resistance four times that of the shorter end. So the voltage + divider ratio is 1 / 5, meaning that the potential in the middle is 1.0 V + (8.0 + V - 1.0 V) * 1/5 = 2.4 V. The answer is (A). + + + Q: A refracting telescope consists of two converging lenses separated by 100 cm. + The eye-piece lens has a focal length of 20 cm. The angular magnification of the + telescope is + + (A) 4 (B) 5 (C) 6 (D) 20 + + A: Let''s think step by step. In a refracting telescope, if both lenses are converging, + the focus of both lenses must be between the two lenses, and thus the focal lengths + of the two lenses must add up to their separation. Since the focal length of one + lens is 20 cm, the focal length of the other must be 80 cm. The magnification is + the ratio of these two focal lengths, or 4. The answer is (A). + + + Q: For which of the following thermodynamic processes is the increase in the internal + energy of an ideal gas equal to the heat added to the gas? + + (A) Constant temperature (B) Constant volume (C) Constant pressure (D) Adiabatic + + A: Let''s think step by step. Heat added to the gas can go into the gases internal + energy or work done against an external force. However, if the volume of the gas + container is constant, no work will be done (since work is pressure times change + in volume). So, at constant volume, all of the heat goes into the internal energy. + The answer is (B).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_college_physics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml new file mode 100644 index 00000000..08f23169 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml @@ -0,0 +1,35 @@ +dataset_name: computer_security +description: "The following are multiple choice questions (with answers) about computer\ + \ security.\n\nQ: SHA-1 has a message digest of\n(A) 160 bits (B) 512 bits (C) 628\ + \ bits (D) 820 bits\nA: Let's think step by step. Since SHA-1 is a hash function\ + \ which takes an input and produces a 160-bit (20-byte) hash value, its message\ + \ digest is 160 bits. The answer is (A).\n\nQ: _____________ can modify data on\ + \ your system \u2013 so that your system doesn\u2019t run correctly or you can no\ + \ longer access specific data, or it may even ask for ransom in order to give your\ + \ access.\n(A) IM \u2013 Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D)\ + \ Ransom Trojan\nA: Let's think step by step. The system is asking for trojans,\ + \ which are for ransom, which means ransom trojan. 
The answer is (D).\n\nQ: What\ + \ is ethical hacking?\n(A) \"Hacking\" ethics so they justify unintended selfish\ + \ behavior (B) Hacking systems (e.g., during penetration testing) to expose vulnerabilities\ + \ so they can be fixed, rather than exploited (C) Hacking into systems run by those\ + \ whose ethics you disagree with (D) A slang term for rapid software development,\ + \ e.g., as part of hackathons\nA: Let's think step by step. Ethical hacking is a\ + \ process of detecting vulnerabilities in an application, system, or organization's\ + \ infrastructure that an attacker can use to exploit an individual or organization.\ + \ They use this process to prevent cyberattacks and security breaches by lawfully\ + \ hacking into the systems and looking for weak points. The answer is (B).\n\nQ:\ + \ The ____________ is anything which your search engine cannot search.\n(A) Haunted\ + \ web (B) World Wide Web (C) Surface web (D) Deep Web\nA: Let's think step by step.\ + \ The search engine searches on the Surface Web, which is the portion of the world\ + \ wide web which is visible so (B,C) are wrong. The Haunted Web doesn\u2019t correspond\ + \ to an internet concept. The Deep Web is the part of the World Wide Web which is\ + \ not indexed. The answer is (D).\n\nQ: Exploitation of the Heartbleed bug permits\n\ + (A) overwriting cryptographic keys in memory (B) a kind of code injection (C) a\ + \ read outside bounds of a buffer (D) a format string attack\nA: Let's think step\ + \ by step. The Heartbleed Bug is a serious vulnerability in the popular OpenSSL\ + \ cryptographic software library. Heartbleed resulted from improper input validation\ + \ (due to a missing bounds check) in the implementation of the TLS heartbeat extension.\ + \ The vulnerability was classified as a buffer over-read, a situation where more\ + \ data can be read than should be allowed. The answer is (C)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_computer_security diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml new file mode 100644 index 00000000..df845ce8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml @@ -0,0 +1,32 @@ +dataset_name: conceptual_physics +description: "\nThe following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted to a different\ + \ frequency (B) deflection (C) interference (D) polarization\nA: Let's think step\ + \ by step. In a soap bubble film, the light bounces between the two soap-air interfaces\ + \ many times, interfering with itself constructively or destructively depending\ + \ on the width of the film. This results in different colors being visible. The\ + \ answer is (C).\n\nQ: Compared with the mass of a uranium atom undergoing fission,\ + \ the combined masses of the products after fission are\n(A) less (B) more (C) the\ + \ same (D) zero\nA: Let's think step by step. Fission releases energy, which comes\ + \ from the rest mass of its initial nucleus. Thus the mass of the products is less\ + \ than the mass of the reactant uranium nucleus. The answer is (A).\n\nQ: Things\ + \ that are equivalent according to the equivalence principle are\n(A) space and\ + \ time. (B) a traveling twin and a stay-at-home twin. (C) gravity and acceleration.\ + \ (D) mass and energy.\nA: Let's think step by step. 
Einstein\u2019s famous equivalence\ + \ principle states that gravity and acceleration are equivalent. The answer is (C).\n\ + \nQ: Which of these three elements has the most mass per nucleon?\n(A) Hydrogen\ + \ (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step. Due to nuclear\ + \ binding energy, the mass of an atomic nucleus is less than the sum of individual\ + \ masses of the free constituent protons and neutrons; this is known as the mass\ + \ defect. Hydrogen has no mass defect because it has only a single nucleon, so it\ + \ will have the most mass per nucleon. The answer is (A).\n\nQ: A model airplane\ + \ flies slower when flying into the wind and faster with wind at its back. When\ + \ launched at right angles to the wind a cross wind its groundspeed compared with\ + \ flying in still air is\n(A) the same (B) greater (C) less (D) either greater or\ + \ less depending on wind speed\nA: Let's think step by step. The plane\u2019s speed\ + \ in the direction of the wind is greater than it would be in the absence of wind,\ + \ and its direction orthogonal to the wind is the same as it would be in the absence\ + \ of the wind. The total speed, which is these two components added in quadrature,\ + \ is thus greater than the speed in still air. The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_conceptual_physics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml new file mode 100644 index 00000000..33883f47 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml @@ -0,0 +1,63 @@ +dataset_name: econometrics +description: "The following are multiple choice questions (with answers) about econometrics.\n\ + \nQ: Suppose now that a researcher wishes to use information criteria to determine\ + \ the optimal lag length for a VAR. 500 observations are available for the bi-variate\ + \ VAR, and the values of the determinant of the variance-covariance matrix of residuals\ + \ are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively. What\ + \ is the optimal model order according to Akaike's information criterion?\n(A) 1\ + \ lag (B) 2 lags (C) 3 lags (D) 4 lags\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on econometrics for help. Let\u2019s solve this problem step\ + \ by step. First of all, let\u2019s recall that for a given set of data, Akaike's\ + \ information criterion (AIC) allows us to measure how well a statistical model\ + \ fits the data; it is an estimator of prediction error. Here in this problem we\ + \ will need to use the formula ln(det(sigma_hat)) + (2 * k / T) to determine the\ + \ values of Akaike\u2019s criterion, where ln denotes the natural log function,\ + \ det the determinant function, k the total number of parameters in total (across\ + \ both equations), and T the number of observations (which, in this case, is equal\ + \ to 500). For 1 lag, the number of parameters in total is equal to 6; for 2 lags,\ + \ it is 10; for 3 lags, it is 14; and for 4 lags, it is 18. 
Now, let\u2019s calculate\ + \ the values of the criterion for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500)\ + \ = ln(0.0336) + (12 / 500) = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) =\ + \ ln(0.0169) + (20 / 500) = -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084)\ + \ + (28 / 500) =-4.724\n(D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36\ + \ / 500) =-5.011\nBecause the optimal model order according to AIC minimizes the\ + \ information criterion, the answer should be the one with the lowest value. In\ + \ this case, (D) has the lowest value. The answer is (C).\n\nQ: Consider the following\ + \ AR(1) model with the disturbances having zero mean and unit variance\nyt = 0.2\ + \ + 0.4 yt-1 + ut\nThe (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4\ + \ (C) 0.5 (D) 0.33\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on econometrics for help. Let\u2019s solve this problem step by step. If we have\ + \ a an AR(1) model with the disturbances having zero mean and unit variance, then\ + \ the unconditional mean of y is equal to the following:\nunconditional mean of\ + \ y = (the intercept term) / (1 - autoregressive coefficient)\nWe know that the\ + \ intercept term is 0.2 and the autoregressive coefficient is 0.4; thus, we have:\n\ + unconditional mean of y = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which\ + \ is approximately 0.33. That means that the answer should be (D) 0.33. The answer\ + \ is (D).\n\nQ: What would be then consequences for the OLS estimator if heteroscedasticity\ + \ is present in a regression model but ignored?\n(A) It will be biased (B) It will\ + \ be inconsistent (C) It will be inefficient (D) All of (a), (b) and (c) will be\ + \ true.\nA: Let's think step by step. We refer to Wikipedia articles on econometrics\ + \ for help. Heteroscedasticity refers to the condition where the variance of the\ + \ error terms is not constant across multiple observations. If heteroscedasticity\ + \ is present in a regression model, then the coefficient estimates in the OLS estimator\ + \ will be not only unbiased and consistent but also inefficient. Because (A) and\ + \ (B) are incorrect choices and (C) is a correct choice, (D) cannot be the right\ + \ answer. Ultimately, (C) is the only true choice. The answer is (C).\n\nQ: Suppose\ + \ that a test statistic has associated with it a p-value of 0.08. Which one of the\ + \ following statements is true?\n(i) If the size of the test were exactly 8%, we\ + \ would be indifferent between rejecting and not rejecting the null hypothesis\n\ + (ii) The null would be rejected if a 10% size of test were used\n(iii) The null\ + \ would not be rejected if a 1% size of test were used\n(iv) The null would be rejected\ + \ if a 5% size of test were used.\n(A) (ii) and (iv) only (B) (i) and (iii) only\ + \ (C) (i), (ii), and (iii) only (D) (i), (ii), (iii), and (iv).\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019\ + s reason about each of the options.\n(i) is a true statement.\n(ii) is a true statement.\n\ + (iii) is a true statement.\n(iv) is not a true statement. Thus, (i), (ii), and (iii)\ + \ are true. The answer is (C).\n\nQ: For a stationary autoregressive process, shocks\ + \ will\n(A) Eventually die away (B) Persist indefinitely (C) Grow exponentially\ + \ (D) Never occur\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ econometrics for help. 
This is a question about stationary processes.\
+ \ For a stationary autoregressive process, shocks will eventually die away. The\
+ \ answer is (A)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_econometrics
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml
new file mode 100644
index 00000000..cdd31ce4
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml
@@ -0,0 +1,34 @@
+dataset_name: electrical_engineering
+description: "\nThe following are multiple choice questions (with answers) about electrical\
+ \ engineering.\n\nQ: A point pole has a strength of 4\u03C0 * 10^-4 weber. The force\
+ \ in newtons on a point pole of 4\u03C0 * 1.5 * 10^-4 weber placed at a distance\
+ \ of 10 cm from it will be\n(A) 15 N. (B) 20 N. (C) 7.5 N. (D) 3.75 N.\nA: Let's\
+ \ think step by step. The force between two point poles is given by m_1m_2/(mu_0\
+ \ 4 \\pi r^2), in analogy to Coulomb\u2019s law. Plugging in the values given in\
+ \ the question, we calculate that the force is approximately 15 N. The answer is\
+ \ (A).\n\nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30\
+ \ mm wide. The control torque is 240*10^-6 N-m on full scale. If magnetic flux density\
+ \ is 1Wb/m2 range of meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's\
+ \ think step by step. The torque on a coil in a uniform magnetic field is given\
+ \ by BANI, where B is the magnetic flux density, A is the area of the coil, N is\
+ \ the number of turns, and I is the current. So we have that I = (Torque)/(BAN),\
+ \ or 240e-6/(1200e-6 * 100 * 1) = 2e-3. The answer is (B).\n\nQ: In an SR latch\
+ \ built from NOR gates, which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1\
+ \ (C) S=1, R=0 (D) S=1, R=1\nA: Let's think step by step. An SR latch is a set-reset\
+ \ latch; in the case where S=1 and R=1, the circuit has no stable state; instead\
+ \ a race condition will be produced within the circuit, so the device will be in\
+ \ an undefined state. So S=1, R=1 is an illegal input. The answer is (D).\n\nQ:\
+ \ Two long parallel conductors carry 100 A. If the conductors are separated by 20\
+ \ mm, the force per meter of length of each conductor will be\n(A) 100 N. (B) 0.1\
+ \ N. (C) 1 N. (D) 0.01 N.\nA: Let's think step by step. The magnetic force-per-length\
+ \ between two current-carrying conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r),\
+ \ where $r$ is the separation distance and I_1 and I_2 are the currents. Plugging\
+ \ in 100 A for I_1 and I_2, and 20 mm for r, gives 0.1 N. The answer is (B).\n\n\
+ Q: In a 2 pole lap winding dc machine , the resistance of one conductor is 2\u03A9\
+ \ and total number of conductors is 100. Find the total resistance\n(A) 200\u03A9\
+ \ (B) 100\u03A9 (C) 50\u03A9 (D) 10\u03A9\nA: Let's think step by step. In lap winding,\
+ \ effectively two resistors are connected in parallel, so the actual resistance\
+ \ of each pair is 1 Ohm. Since we have 50 pairs, we get a total resistance of 50\
+ \ Ohms. The answer is (C)."
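
The parallel-conductor result in the electrical-engineering exemplar above follows
from F/L = mu_0 * I_1 * I_2 / (2 * pi * r); a quick numerical check:

import math

mu_0 = 4 * math.pi * 1e-7                           # vacuum permeability, H/m
f_per_m = mu_0 * 100 * 100 / (2 * math.pi * 20e-3)  # I = 100 A, r = 20 mm
assert abs(f_per_m - 0.1) < 1e-12                   # 0.1 N per metre, answer (B)
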
+include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_electrical_engineering diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml new file mode 100644 index 00000000..a85f799c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml @@ -0,0 +1,40 @@ +dataset_name: elementary_mathematics +description: "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\nQ: Olivia used the rule \"Add 11\" to create the number pattern\ + \ shown below. 10, 21, 32, 43, 54. Which statement about the number pattern is true?\n\ + (A) The 10th number in the pattern will be an even number.\n(B) The number pattern\ + \ will never have two even numbers next to each other.\n(C) The next two numbers\ + \ in the pattern will be an even number then an odd number.\n(D) If the number pattern\ + \ started with an odd number then the pattern would have only odd numbers in it.\n\ + A: Let's think step by step. Choice A is incorrect because every even-numbered term\ + \ in the pattern is odd, and 10 is an even number. Choice B is correct, because\ + \ adding an odd number (in this case 11) to an odd number produces an even number,\ + \ and adding an odd number to an even number produces an odd number. Thus the terms\ + \ in the pattern will alternate between odd and even, so there will never be two\ + \ even numbers next to each other. Choice C is incorrect because the last term in\ + \ the example is even (54), and we know that the terms will alternate between even\ + \ and odd. Choice D is incorrect because the terms in the pattern will alternate\ + \ between odd and even, regardless of the value of the first term. The answer is\ + \ (B).\n\nQ: The population of the city where Michelle was born is 145,826. What\ + \ is the value of the 5 in the number 145,826?\n(A) 5 thousands\n(B) 5 hundreds\n\ + (C) 5 tens\n(D) 5 ones\nA: Let's think step by step. Choice A is correct, because\ + \ there are three digits following the 5, so\nthe 5 is in the thousands place. Thus\ + \ the other choices are incorrect. The answer is (A).\n\nQ: A store sells 107 different\ + \ colors of paint. They have 25 cans of each color in storage. The number of cans\ + \ of paint the store has in storage can be found using the expression below. 107\ + \ \xD7 25. How many cans of paint does the store have in storage?\n(A) 749\n(B)\ + \ 2,675\n(C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107\ + \ x 25 = (100 x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total\ + \ of 30 players will play basketball at a park. There will be exactly 5 players\ + \ on each team. Which statement correctly explains how to find the number of teams\ + \ needed?\n(A) Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n\ + (C) Multiply 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\n\ + A: Let's think step by step. We want to find the number of teams. We know that there\ + \ are 5 players/team, and 30 players. Thus to get the number of teams we divide\ + \ players by players/team, so 30 players / 5 players/team = 6 teams. The answer\ + \ is (B).\n\nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n\ + (B) (5 x 5) + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think\ + \ step by step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5\ + \ x 4). 
The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_elementary_mathematics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml new file mode 100644 index 00000000..5de7486c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml @@ -0,0 +1,57 @@ +dataset_name: formal_logic +description: "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\nQ: Which of the given formulas of PL is the best symbolization of the\ + \ following sentence?\nTurtles live long lives and are happy creatures, unless they\ + \ are injured.\n(A) (L \u2022 H) \u2261 I (B) (L \u2022 H) \u2228 I (C) L \u2022\ + \ (H \u2228 I) (D) L \u2022 (H \u2283 R).\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on formal logic for help. Let\u2019s solve this step by\ + \ step. Let \u201CL\u201D denote \u201Cliving long\u201D, H \u201Cbeing happy\u201D\ + , and \u201CI\u201D \u201Cbeing injured\u201D. Now, consider each choice:\n(A) means\ + \ (living long AND being happy) is equivalent to (being injured). \n(B) means (living\ + \ long AND being happy) OR (being injured). \n(C) means (living long) AND (being\ + \ happy OR being injured). \n(D) means (living long) AND (being happy implies being\ + \ R), but what R denotes is not clear.\nObviously, (B) is the best symbolization\ + \ of the original sentence. The answer is (B).\n\nQ: Select the best translation\ + \ into predicate logic.George borrows Hector's lawnmower. (g: George; h: Hector;\ + \ l: Hector's lawnmower; Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh\ + \ (D) Bghl\nA: Let's think step by step. We refer to Wikipedia articles on formal\ + \ logic for help. Let\u2019s solve this step by step. We are told that \u201CBxyx\u201D\ + \ means \u201Cx borrows y from z\u201D. We can rewrite \u201CGeorge borrows Hector's\ + \ lawnmower\u201D as \u201CGeorge borrows a lawnmower from Hector\u201D, which can\ + \ then be translated into predicate logic as \u201CBglh\u201D. The answer \u201C\ + Bglh\u201D appears in (C); therefore, (C) must be the correct answer. The answer\ + \ is (C).\n\nQ: \nSelect the best English interpretation of the given arguments\ + \ in predicate logic.\nDm\n(\u2200x)(Wx \u2283 ~Dx). \n(\u2200x)Wx \u2228 Ag\t/\ + \ (\u2203x)Ax\n(A) Marina is a dancer. Some weaklings are not dancers. Either everything\ + \ is a weakling or Georgia plays volleyball. So something plays volleyball. (B)\ + \ Marina is a dancer. No weakling is a dancer. Everything is either a weakling or\ + \ plays volleyball. So something plays volleyball. (C) Marina is a dancer. Some\ + \ weaklings are not dancers. Everything is either a weakling or plays volleyball.\ + \ So something plays volleyball. (D) Marina is a dancer. No weakling is a dancer.\ + \ Either everything is a weakling or Georgia plays volleyball. So something plays\ + \ volleyball.\nA: Let's think step by step. We refer to Wikipedia articles on formal\ + \ logic for help. Let\u2019s solve this step by step. Let \u201CD\u201D denote \u201C\ + being a dancer\u201D, \u201Cm\u201D denote \u201CMaria\u201D, \u201Cg\u201D denote\ + \ \u201CGeorgia\u201D, \u201CW\u201D denote \u201Cweakling\u201D, \u201CA\u201D\ + \ denote \u201Cplaying volleyball\u201D. Then, we have the following:\n1. Dm \u2192\ + \ Maria is a dance.\n2. (\u2200x)(Wx \u2283 ~Dx). \u2192 For all x, if x is a weakling,\ + \ then x is not a dancer. 
In other words, no weakling is a dancer.\n3. (\u2200x)Wx\
+ \ \u2228 Ag\t/ (\u2203x)Ax \u2192 For all x, x is a weakling or Georgia plays volleyball.\
+ \ So there exists an x that plays volleyball. \nOptions (A) and (C) do claim that\
+ \ some weaklings are not dancers, but the second argument strongly states that no\
+ \ weakling is a dancer. Thus, we can eliminate them. Option (B) omits the important\
+ \ detail about Georgia playing volleyball. Option (D) has all the details presented\
+ \ in the arguments and is the best English interpretation of the arguments. The\
+ \ answer is (D).\n\nQ: Select the best translation into predicate logic: No people\
+ \ drive on Mars.\n(A) ~Pd (B) (\u2200x)(Px \u2228 ~Dx) (C) (\u2200x)(Px \u2283 ~Dx)\
+ \ (D) ~Dp\nA: Let's think step by step. We refer to Wikipedia articles on formal\
+ \ logic for help. Let\u2019s solve this step by step. Let \u201CP\u201D denote \u201C\
+ being on Mars\u201D and \u201CD\u201D denote \u201Cdriving on Mars\u201D. Then let\u2019\
+ s consider each option:\nOption (A): ~Pd \u2192 d is not on Mars.\nOption (B): (\u2200\
+ x)(Px \u2228 ~Dx) \u2192 For all x, x is on Mars or x does not drive on Mars.\nOption\
+ \ (C): (\u2200x)(Px \u2283 ~Dx) \u2192 For all x, x is on Mars implies that x does\
+ \ not drive on Mars.\nOption (D): ~Dp: \u2192 p does not drive on Mars.\nOf all these\
+ \ options, Option (C) appears to be the best and most meaningful interpretation\
+ \ of the argument \u201CNo people drive on Mars.\u201D The answer is (C)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_formal_logic
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
new file mode 100644
index 00000000..d5031b00
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
@@ -0,0 +1,33 @@
+dataset_name: global_facts
+description: "The following are multiple choice questions (with answers) about global\
+ \ facts.\n\nQ: As of 2017, how many of the world\u2019s 1-year-old children today\
+ \ have been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%\n\
+ A: Let's think step by step. We refer to Wikipedia articles on global facts for\
+ \ help. According to data published by the World Health Organization, the number\
+ \ of 1-year-old children vaccinated in 2017 exceeds 80%. The answer is (A).\n\n\
+ Q: As of 2019, about what percentage of Americans agree that the state is run for\
+ \ the benefit of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on global facts for help. In 2019,\
+ \ about 46% of Americans agreed that the state is run for the benefit\
+ \ of all the people. The answer is (B).\n\nQ: As of 2019, about what percentage\
+ \ of Russians say it is very important to have free media in our country without\
+ \ government/state censorship?\n(A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on global facts for help. As of 2019,\
+ \ about 38% of Russians say it is very important to have free media in our country.\
+ \ The answer is (A).\n\nQ: As of 2015, since 1990 forests have ____ in Europe and\
+ \ have ____ in Africa and the Americas.\n(A) increased, increased (B) increased,\
+ \ decreased (C) decreased, increased (D) decreased, decreased\nA: Let's think step\
+ \ by step. We refer to Wikipedia articles on global facts for help. 
As of 2015,\
 \ since 1990 forests have increased in Europe and have decreased in Africa and the\
 \ Americas. The answer is (B).\n\nQ: Which of the following pairs of statements\
 \ are both true (as of 2019)?\n(A) People tend to be optimistic about their own\
 \ future and the future of their nation or the world. (B) People tend to be optimistic\
 \ about their own future but pessimistic about the future of their nation or the\
 \ world. (C) People tend to be pessimistic about their own future but optimistic\
 \ about the future of their nation or the world. (D) People tend to be pessimistic\
 \ about their own future and the future of their nation or the world.\nA: Let's\
 \ think step by step. We refer to Wikipedia articles on global facts for help. As\
 \ of 2019, most people tend to be optimistic about their own future but pessimistic\
 \ about the future of their nation or the world. The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_global_facts diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml new file mode 100644 index 00000000..91295fe8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml @@ -0,0 +1,54 @@ +dataset_name: high_school_biology +description: "The following are multiple choice questions (with answers) about high\
 \ school biology.\n\nQ: In animal cells, which of the following represents the most\
 \ likely pathway that a secretory protein takes as it is synthesized in a cell?\n\
 (A) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013\
 rough ER (B) Ribosome\u2013Golgi apparatus\u2013rough ER\u2013secretory vesicle\u2013\
 plasma membrane (C) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory\
 \ vesicle\u2013rough ER (D) Ribosome\u2013rough ER\u2013Golgi apparatus\u2013secretory\
 \ vesicle\u2013plasma membrane\nA: Let's think step by step. Protein synthesis starts\
 \ at the ribosome, so we can eliminate (A) and (C). The ribosome is often attached\
 \ to the rough endoplasmic reticulum; the protein moves from there to the Golgi\
 \ apparatus, where it is modified and packaged into a vesicle. The vesicle then\
 \ moves to the plasma membrane and the protein is secreted. The answer is (D).\n\nQ: A mutation in a bacterial enzyme changed\
 \ a previously polar amino acid into a nonpolar amino acid. This amino acid was\
 \ located at a site distant from the enzyme\u2019s active site. How might this mutation\
 \ alter the enzyme\u2019s substrate specificity?\n(A) By changing the enzyme\u2019\
 s pH optimum (B) By changing the enzyme\u2019s location in the cell (C) By changing\
 \ the shape of the protein (D) An amino acid change away from the active site cannot\
 \ alter the enzyme\u2019s substrate specificity.\nA: Let's think step by step. A\
 \ change in an amino acid leads to a change in the primary structure of the protein.\
 \ A change in the primary structure may lead to a change in the secondary and the\
 \ tertiary structure of the protein. A change in the tertiary structure means a\
 \ change in the shape of the protein, so (C) has to be correct. Since the change\
 \ does not affect the active site of the enzyme, we do not expect the activity of\
 \ the enzyme to be affected.
The answer is (C).\n\nQ: Which of the following is\
 \ not a way to form recombinant DNA?\n(A) Translation (B) Conjugation (C) Specialized\
 \ transduction (D) Transformation\nA: Let's think step by step. The introduction\
 \ of foreign DNA or RNA into bacteria or eukaryotic cells is a common technique\
 \ in molecular biology and scientific research. There are multiple ways foreign\
 \ DNA can be introduced into cells including transformation, transduction, conjugation,\
 \ and transfection. In contrast, (A) is not a way to form DNA: during translation\
 \ the ribosomes synthesize proteins from RNA. The answer is (A).\n\nQ: Homologous\
 \ structures are often cited as evidence for the process of natural selection. All\
 \ of the following are examples of homologous structures EXCEPT\n(A) the wings of\
 \ a bird and the wings of a bat (B) the flippers of a whale and the arms of a man\
 \ (C) the pectoral fins of a porpoise and the flippers of a seal (D) the forelegs\
 \ of an insect and the forelimbs of a dog\nA: Let's think step by step. Homologous\
 \ structures are similar physical features in organisms that share a common ancestor\
 \ but have different functions. Comparisons (B) and (C) are clearly homologous because\
 \ they share a common ancestor and the structures serve different purposes. Bat wings\
 \ and bird wings are also homologous; while they are both wings, the underlying forelimbs\
 \ serve different purposes. Insects and dogs are only very distantly related, since\
 \ one is an invertebrate and the other a vertebrate, and their forelimbs serve the\
 \ same purpose, so they are not homologous. The answer is (D).\n\nQ: Which of\
 \ the following is not known to be involved in the control of cell division?\n(A)\
 \ Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast cells\nA: Let's think\
 \ step by step. Normal cells move through the cell cycle in a regulated way. At\
 \ the checkpoint stage, they use information about their own internal state and\
 \ cues from the environment around them to decide whether to proceed with cell division.\
 \ Cues like these act by changing the activity of core cell cycle regulators inside\
 \ the cell. The most common regulators are cyclins and cyclin-dependent kinases.\
 \ Fibroblast cells do not play any role in cell division. The answer is (D)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_biology diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml new file mode 100644 index 00000000..ce2a26cc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml @@ -0,0 +1,49 @@ +dataset_name: high_school_chemistry +description: "The following are multiple choice questions (with answers) about high\
 \ school chemistry.\n\nQ: Which of the following is considered an acid anhydride?\n\
 (A) HCl (B) H2SO3 (C) SO2 (D) Al(NO3)3\nA: Let's think step by step. An acid anhydride\
 \ is a compound that is derived by removing water from an acid. The chemical formula\
 \ for water is H2O, which means that we need to determine which of these options,\
 \ when combined with H2O, forms an acid. SO2, or sulfur dioxide, when combined with\
 \ H2O, makes H2SO3, or sulfurous acid. The answer is (C).\n\nQ: Which of the following\
 \ is expected to be a polar molecule?\n(A) PCl4F (B) BF3 (C) CO2 (D) Si(CH3)4\n\
 A: Let's think step by step.
A polar molecule is one that has a slightly positive\
 \ charge on one end of the molecule and a slightly negative charge on the other\
 \ end. Boron trifluoride (BF3) has Boron as the center atom and three fluorine atoms\
 \ attached to it; it is trigonal planar and symmetric, so it is nonpolar. Carbon\
 \ Dioxide (CO2) has Carbon as the central atom with double bonds to two Oxygen atoms\
 \ - this is also symmetrical and therefore nonpolar. The same is the case for tetramethyl\
 \ silane (Si(CH3)4), which is a Silicon atom surrounded by four methyl groups. The\
 \ structure of PCl4F is that Phosphorus is the central atom, attached to four chlorines\
 \ and one fluorine atom. This is asymmetrical, and therefore has a net dipole and\
 \ is expected to be a polar molecule. The answer is (A).\n\nQ: From the solubility\
 \ rules, which of the following is true?\n(A) All chlorides, bromides, and iodides\
 \ are soluble (B) All sulfates are soluble (C) All hydroxides are soluble (D) All\
 \ ammonium-containing compounds are soluble\nA: Let's think step by step. The chlorides,\
 \ bromides, and iodides of lead, silver, and mercury are not soluble in water. This\
 \ rules out (A). The sulfates of lead, barium, and calcium are not soluble in water,\
 \ which rules out (B). The hydroxides of any metal besides sodium, potassium, ammonium,\
 \ calcium, and barium are insoluble. This rules out (C). Typically ammonium ions\
 \ indicate a soluble ionic substance. The answer is (D).\n\nQ: A new compound is\
 \ synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol.\
 \ When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured\
 \ as 3.89. What is the pKa of this acid?\n(A) 3.89 (B) 7.78 (C) 5.78 (D) 2.33\n\
 A: Let's think step by step. Recall that $[A^{-}] = [H^{+}]$. Here, this is equal to\
 \ $10^{-3.89}$. Then we have $K_{a} = \\frac{[H^{+}][A^{-}]}{[HA]} = \\frac{10^{-3.89}\
 \ \\cdot 10^{-3.89}}{10^{-2}}$. The resulting exponent is $-3.89 + (-3.89) - (-2)\
 \ = -5.78$, therefore $K_a = 10^{-5.78}$. The $pK_a$ is the negative log of $K_a$,\
 \ which is equal to $5.78$. The answer is (C).\n\nQ: A solution contains 2.00 mole\
 \ of acetic acid, CH3COOH, and 1.00 mole of calcium acetate, Ca(CH3COO)2. The solution\
 \ is able to resist the addition of a small amount of strong acid or strong base\
 \ with only minor changes in the pH of the solution. Larger quantities of strong\
 \ acid or strong base can cause a significant change in pH. How many moles of nitric\
 \ acid, HNO3, may be added before the pH begins to change significantly?\n(A) 0.500\
 \ mole (B) 1.00 mole (C) 2.00 mole (D) 3.00 mole\nA: Let's think step by step. We\
 \ would like to compute the buffer capacity of this solution. First we write the\
 \ equation for the ionization of the weak acid, in this case of acetic acid. $CH_{3}COOH\
 \ (aq) + H_{2}O \\rightarrow H_{3}O^{+} + CH_{3}COO^{-}$. The conjugate base is therefore\
 \ the acetate ion. The added strong acid, Nitric acid, will react with the conjugate\
 \ base. Therefore the maximum amount of acid that can be added will be equal to\
 \ the amount of acetate ion, or 2 moles. The answer is (C)."
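(Aside: the pKa arithmetic in the chemistry few-shot above is easy to sanity-check. The sketch below is a throwaway verification in Python, not part of the task YAMLs; the variable names are our own.)

import math

# From the few-shot example above: 0.0050 mol of a monoprotic acid in 0.500 L,
# with a measured pH of 3.89. The prompt approximates [HA] by the nominal 10^-2 M.
pH = 3.89
HA = 0.0050 / 0.500               # nominal acid concentration, 0.01 M
H = 10 ** -pH                     # [H+] = [A-] for a monoprotic acid
Ka = H * H / HA                   # ~10^-5.78
print(round(-math.log10(Ka), 2))  # prints 5.78, matching answer (C)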
+include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_chemistry diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml new file mode 100644 index 00000000..16a9f66d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml @@ -0,0 +1,70 @@ +dataset_name: high_school_computer_science +description: "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\nQ: Which of the following is an example of the use\ + \ of a device on the Internet of Things (IoT) ?\n(A) A car alerts a driver that\ + \ it is about to hit an object. (B) A hiker uses a G P S watch to keep track of\ + \ her position. (C) A refrigerator orders milk from an online delivery service when\ + \ the milk in the refrigerator is almost gone. (D) A runner uses a watch with optical\ + \ sensors to monitor his heart rate.\nA: Let's think step by step. The term Internet\ + \ of Things (IoT) refers to common devices which are connected to the internet,\ + \ enabling new functionality. Choice A is incorrect because it does not describe\ + \ an internet connected device. In choice B, the watch is only described as having\ + \ GPS functionality but no internet connectivity. Choice C describes a common device\ + \ (a refrigerator) which has internet connectivity enabling new functionality (online\ + \ ordering). Choice D does not mention internet connectivity for the watch, only\ + \ optical sensors. The answer is (C).\n\nQ: Many Web browsers allow users to open\ + \ anonymous windows. During a browsing session in an anonymous window, the browser\ + \ does not record a browsing history or a list of downloaded files. When the anonymous\ + \ window is exited, cookies created during the session are deleted. Which of the\ + \ following statements about browsing sessions in an anonymous window is true?\n\ + (A) The activities of a user browsing in an anonymous window will not be visible\ + \ to people who monitor the user's network, such as the system administrator. (B)\ + \ Items placed in a Web store's shopping cart for future purchase during the anonymous\ + \ browsing session will not be saved on the user's computer. (C) A user will not\ + \ be able to log in to e-mail or social media accounts during the anonymous browsing\ + \ session. (D) A user browsing in an anonymous window will be protected from viruses\ + \ launched from any web sites visited or files downloaded.\nA: Let's think step\ + \ by step. Choice A is incorrect as it only describes network traffic, which an\ + \ anonymous browser does not change. Choice B is correct as it correctly describes\ + \ how an anonymous browser will prevent saving data on the user\u2019s computer\ + \ after the session is ended. Choice C is incorrect because an anonymous browser\ + \ will not prevent logging in to email or social media accounts. Choice D is incorrect\ + \ because an anonymous browser in itself performs no virus protection. 
The answer\ + \ is (B).\n\nQ: In the program below, the initial value of X is 5 and the initial\ + \ value of Y is 10.\nIF (X < 0){\n DISPLAY (\"Foxtrot\")\n} ELSE {\n IF (X > Y){\n\ + \ DISPLAY (\"Hotel\")\n } ELSE {\n IF (Y > 0){\n DISPLAY (\"November\")\n }\ + \ ELSE {\n DISPLAY (\"Yankee\")\n }\n }\n}\nWhat is displayed as a result of\ + \ running the program?\n(A) Foxtrot (B) Hotel (C) November (D) Yankee\nA: Let's\ + \ think step by step. Because X has the value 5, the first conditional IF (X < 0)\ + \ is false, so we move to the first ELSE clause. Because X is 5 and Y is 10, the\ + \ second conditional IF (X > Y) is false, so we move to the following ELSE clause.\ + \ Since Y is 10, the conditional IF (Y > 0) is true, so the command DISPLAY (\"\ + November\") is executed. The answer is (C).\n\nQ: What is the output of \"abc\"\ + [::-1] in Python 3?\n(A) Error (B) abc (C) cba (D) c\nA: Let's think step by step.\ + \ We know that the slicing operator [::-1] takes all of the elements in the string\ + \ in reverse order, so we reverse the order of the string \"abc\", resulting in\ + \ \"cba\". The answer is (C).\n\nQ: A list of numbers has n elements, indexed from\ + \ 1 to n. The following algorithm is intended to display the number of elements\ + \ in the list that have a value greater than 100. The algorithm uses the variables\ + \ count and position. Steps 3 and 4 are missing.\n Step 1: Set count to 0 and position\ + \ to 1.\n Step 2: If the value of the element at index position is greater than\ + \ 100, increase the value of count by 1.\n Step 3: (missing step)\n Step 4: (missing\ + \ step)\n Step 5: Display the value of count.\nWhich of the following could be used\ + \ to replace steps 3 and 4 so that the algorithm works as intended?\n(A) Step 3:\ + \ Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until the\ + \ value of count is greater than 100.\n(B) Step 3: Increase the value of position\ + \ by 1.\n Step 4: Repeat steps 2 and 3 until the value of position is greater than\ + \ n.\n(C) Step 3: Repeat step 2 until the value of count is greater than 100.\n\ + \ Step 4: Increase the value of position by 1.\n(D) Step 3: Repeat step 2 until\ + \ the value of position is greater than n.\n Step 4: Increase the value of count\ + \ by 1.\nA: Let's think step by step. Choice A is incorrect, because its Step 4\ + \ has an incorrect termination condition, stopping when count is greater than 100.\ + \ We need to stop after inspecting all elements in the list. Choice B is correct\ + \ because it correctly increments both count and position, and correctly repeats\ + \ these steps and terminates when all elements in the list have been inspected.\ + \ Choice C is incorrect because it incorrectly increments the variable count until\ + \ its value is greater than 100, regardless of the elements in the list. Choice\ + \ D is incorrect because its step 3 does not increment the value of position, so\ + \ it will repeat forever. The answer is (B)." 
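(Aside: the nested-IF trace in the computer-science few-shot above can be verified by transcribing the pseudocode directly into Python; a throwaway sketch, not part of the task YAMLs.)

# Direct transcription of the pseudocode program from the few-shot above.
x, y = 5, 10
if x < 0:
    print("Foxtrot")
elif x > y:
    print("Hotel")
elif y > 0:
    print("November")  # taken: x = 5 is not negative, 5 <= 10, and y = 10 > 0
else:
    print("Yankee")
# Prints "November", matching answer (C); "abc"[::-1] == "cba" likewise checks
# the slicing example.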
+include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_computer_science diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml new file mode 100644 index 00000000..0e7aafcc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml @@ -0,0 +1,168 @@ +dataset_name: high_school_european_history +description: "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\nQ: This question refers to the following information.\n\ + Albeit the king's Majesty justly and rightfully is and ought to be the supreme head\ + \ of the Church of England, and so is recognized by the clergy of this realm in\ + \ their convocations, yet nevertheless, for corroboration and confirmation thereof,\ + \ and for increase of virtue in Christ's religion within this realm of England,\ + \ and to repress and extirpate all errors, heresies, and other enormities and abuses\ + \ heretofore used in the same, be it enacted, by authority of this present Parliament,\ + \ that the king, our sovereign lord, his heirs and successors, kings of this realm,\ + \ shall be taken, accepted, and reputed the only supreme head in earth of the Church\ + \ of England, called Anglicans Ecclesia; and shall have and enjoy, annexed and united\ + \ to the imperial crown of this realm, as well the title and style thereof, as all\ + \ honors, dignities, preeminences, jurisdictions, privileges, authorities, immunities,\ + \ profits, and commodities to the said dignity of the supreme head of the same Church\ + \ belonging and appertaining; and that our said sovereign lord, his heirs and successors,\ + \ kings of this realm, shall have full power and authority from time to time to\ + \ visit, repress, redress, record, order, correct, restrain, and amend all such\ + \ errors, heresies, abuses, offenses, contempts, and enormities, whatsoever they\ + \ be, which by any manner of spiritual authority or jurisdiction ought or may lawfully\ + \ be reformed, repressed, ordered, redressed, corrected, restrained, or amended,\ + \ most to the pleasure of Almighty God, the increase of virtue in Christ's religion,\ + \ and for the conservation of the peace, unity, and tranquility of this realm; any\ + \ usage, foreign land, foreign authority, prescription, or any other thing or things\ + \ to the contrary hereof notwithstanding.\nEnglish Parliament, Act of Supremacy,\ + \ 1534\nFrom the passage, one may infer that the English Parliament wished to argue\ + \ that the Act of Supremacy would\n(A) give the English king a new position of authority\ + \ (B) give the position of head of the Church of England to Henry VIII alone and\ + \ exclude his heirs (C) establish Calvinism as the one true theology in England\ + \ (D) end various forms of corruption plaguing the Church in England\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on european history for help. The\ + \ Act of Supremacy states that it grants authority to the king \"to repress and\ + \ extirpate all errors, heresies, and other enormities and abuses\", referring to\ + \ the corruption in the Church of England. The answer is (D).\n\nQ: This question\ + \ refers to the following information.\nRead the following excerpt.\nThe revolutionary\ + \ seed had penetrated into every country and spread more or less. 
It was greatly\ + \ developed under the r\xE9gime of the military despotism of Bonaparte. His conquests\ + \ displaced a number of laws, institutions, and customs; broke through bonds sacred\ + \ among all nations, strong enough to resist time itself; which is more than can\ + \ be said of certain benefits conferred by these innovators.\nThe monarchs will\ + \ fulfil the duties imposed upon them by Him who, by entrusting them with power,\ + \ has charged them to watch over the maintenance of justice, and the rights of all,\ + \ to avoid the paths of error, and tread firmly in the way of truth. Placed beyond\ + \ the passions which agitate society, it is in days of trial chiefly that they are\ + \ called upon to despoil realities of their false appearances, and to show themselves\ + \ as they are, fathers invested with the authority belonging by right to the heads\ + \ of families, to prove that, in days of mourning, they know how to be just, wise,\ + \ and therefore strong, and that they will not abandon the people whom they ought\ + \ to govern to be the sport of factions, to error and its consequences, which must\ + \ involve the loss of society.\nUnion between the monarchs is the basis of the policy\ + \ which must now be followed to save society from total ruin. . . .\nLet them not\ + \ confound concessions made to parties with the good they ought to do for their\ + \ people, in modifying, according to their recognized needs, such branches of the\ + \ administration as require it.\nLet them be just, but strong; beneficent, but strict.\n\ + Let them maintain religious principles in all their purity, and not allow the faith\ + \ to be attacked and morality interpreted according to the social contract or the\ + \ visions of foolish sectarians.\nLet them suppress Secret Societies; that gangrene\ + \ of society.\n\u2014Klemens von Metternich, Political Confession of Faith, 1820\n\ + Which of the following was the greatest cause of the fears expressed by Metternich\ + \ in the document above?\n(A) The ideas of personal liberty and nationalism conceived\ + \ during the Enlightenment resulted in radical revolutions that could spread throughout\ + \ Europe. (B) The conquest of Europe by Napoleon led to the creation of new factions\ + \ and shifted the European balance of power. (C) The power of monarchs had grown\ + \ to the point where it needed to be checked by other powers within each nation\ + \ or domination of civilians would occur. (D) The rising and falling economic cycle\ + \ of the newly emerging capitalist economy could lead to civilian unrest that must\ + \ be suppressed.\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ european history for help. 
The fears of revolution in early 19th century Europe\ + \ expressed by Klemens von Metternich, a conservative Austrian statesman, were a\ + \ direct result of the age of Enlightenment, a period of European history where\ + \ the absolute power of the monarchy was challenged with ideas of individual liberty\ + \ and nationalism, leading to the French revolution and its effects all over Europe.\ + \ The answer is (A).\n\nQ: This question refers to the following information.\n\ + The excerpts below are from the Navigation Acts of 1651.\n[A]fter the first day\ + \ of December, one thousand six hundred fifty and one, and from thence forwards,\ + \ no goods or commodities whatsoever of the growth, production or manufacture of\ + \ Asia, Africa or America, or of any part thereof; or of any islands belonging to\ + \ them, or which are described or laid down in the usual maps or cards of those\ + \ places, as well of the English plantations as others, shall be imported or brought\ + \ into this Commonwealth of England, or into Ireland, or any other lands, islands,\ + \ plantations, or territories to this Commonwealth belonging, or in their possession,\ + \ in any other ship or ships, vessel or vessels whatsoever, but only in such as\ + \ do truly and without fraud belong only to the people of this Commonwealth, or\ + \ the plantations thereof, as the proprietors or right owners thereof; and whereof\ + \ the master and mariners are also of the people of this Commonwealth, under the\ + \ penalty of the forfeiture and loss of all the goods that shall be imported contrary\ + \ to this act, , , ,\n[N]o goods or commodities of the growth, production, or manufacture\ + \ of Europe, or of any part thereof, shall after the first day of December, one\ + \ thousand six hundred fifty and one, be imported or brought into this Commonwealth\ + \ of England, or any other lands or territories to this Commonwealth belonging,\ + \ or in their possession, in any ship or ships, vessel or vessels whatsoever, but\ + \ in such as do truly and without fraud belong only to the people of this Commonwealth,\ + \ and in no other, except only such foreign ships and vessels as do truly and properly\ + \ belong to the people of that country or place, of which the said goods are the\ + \ growth, production or manufacture.\nWhich of the following best describes the\ + \ outcome of the Navigation Acts of 1651?\n(A) They served as a catalyst for the\ + \ growth of English shipping and overseas trade, but did little to limit the prospects\ + \ of the Dutch in the seventeenth century. (B) They brought about almost immediate\ + \ hardships for the Dutch economy as their dominance of overseas trade quickly ended.\ + \ (C) They were rescinded during the restoration of the Stuarts as they sought normal\ + \ diplomatic relations with the Dutch so not as to need Parliament's financial support\ + \ for war. (D) They led to nearly a century of recurrent war between England and\ + \ the Netherlands, which would not end until after American independence.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on european history for help.\ + \ The Navigation Acts of 1651 helped English shipping by restricting the ability\ + \ of ships from other European countries, especially the Dutch, to transport goods\ + \ from colonies in Asia and Africa into England. 
The answer is (A).\n\nQ: This question\ + \ refers to the following information.\nIn Russia there was nothing going on well,\ + \ and [Souvarine] was in despair over the news he had received. His old companions\ + \ were all turning to the politicians; the famous Nihilists who made Europe tremble-sons\ + \ of village priests, of the lower middle class, of tradesmen-could not rise above\ + \ the idea of national liberation, and seemed to believe that the world would be\ + \ delivered-when they had killed their despot&\u2026\n\"Foolery! They'll never get\ + \ out of it with their foolery.\"\nThen, lowering his voice still more, in a few\ + \ bitter words he described his old dream of fraternity. He had renounced his rank\ + \ and his fortune; he had gone among workmen, only in the hope of seeing at last\ + \ the foundation of a new society of labour in common. All the sous in his pockets\ + \ had long gone to the urchins of the settlement; he had been as tender as a brother\ + \ with the colliers, smiling at their suspicion, winning them over by his quiet\ + \ workmanlike ways and his dislike of chattering. But decidedly the fusion had not\ + \ taken place.\nHis voice changed, his eyes grew bright, he fixed them on \xE9tienne,\ + \ directly addressing him:\n\"Now, do you understand that? These hatworkers at Marseilles\ + \ who have won the great lottery prize of a hundred thousand francs have gone off\ + \ at once and invested it, declaring that they are going to live without doing anything!\ + \ Yes, that is your idea, all of you French workmen; you want to unearth a treasure\ + \ in order to devour it alone afterwards in some lazy, selfish corner. You may cry\ + \ out as much as you like against the rich, you haven't got courage enough to give\ + \ back to the poor the money that luck brings you. You will never be worthy of happiness\ + \ as long as you own anything, and your hatred of the bourgeois proceeds solely\ + \ from an angry desire to be bourgeois yourselves in their place.\"\n\xE9mile Zola,\ + \ French writer, Germinal, 1885\nThe passage displays the direct concern for the\ + \ welfare of the working classes that was typically a part of which movement?\n\ + (A) Capitalist (B) Scientific (C) Communist (D) Existentialist\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on european history for help. The modern\ + \ Communist movement aims to establish a classless society based on communal ownership\ + \ and distribution of property and means of production, thereby especially benefiting\ + \ the working classes. The answer is (C).\n\nQ: This question refers to the following\ + \ information.\nThe following excerpt is from a pamphlet.\nYou will do me the justice\ + \ to remember, that I have always strenuously supported the Right of every man to\ + \ his own opinion, however different that opinion might be to mine. He who denies\ + \ to another this right, makes a slave of himself to his present opinion, because\ + \ he precludes himself the right of changing it.\nThe most formidable weapon against\ + \ errors of every kind is Reason. 
I have never used any other, and I trust I never\
 \ shall.\nThe circumstance that has now taken place in France of the total abolition\
 \ of the whole national order of priesthood, and of everything appertaining to compulsive\
 \ systems of religion, and compulsive articles of faith, has not only precipitated\
 \ my intention, but rendered a work of this kind exceedingly necessary, lest in\
 \ the general wreck of superstition, of false systems of government, and false theology,\
 \ we lose sight of morality, of humanity, and of the theology that is true.\nI believe\
 \ in one God, and no more; and I hope for happiness beyond this life.\nI believe\
 \ in the equality of man; and I believe that religious duties consist in doing justice,\
 \ loving mercy, and endeavoring to make our fellow-creatures happy.\nI do not believe\
 \ in the creed professed by the Jewish church, by the Roman church, by the Greek\
 \ church, by the Turkish church, by the Protestant church, nor by any church that\
 \ I know of. My own mind is my own church.\nAll national institutions of churches,\
 \ whether Jewish, Christian or Turkish, appear to me no other than human inventions,\
 \ set up to terrify and enslave mankind, and monopolize power and profit.\nI do\
 \ not mean by this declaration to condemn those who believe otherwise; they have\
 \ the same right to their belief as I have to mine.\n\u2014Thomas Paine, The Age\
 \ of Reason, 1794\u20131795\nWhich of the following Enlightenment philosophes designed\
 \ a system of checks and balances for government to avoid abuses of power?\n(A)\
 \ Jean Jacques Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft (D) Adam Smith\n\
 A: Let's think step by step. We refer to Wikipedia articles on european history\
 \ for help. Baron Montesquieu was an 18th century French philosopher who wrote extensively\
 \ against the monopolization of power and advocated for a system of checks and balances\
 \ in government to prevent the rise of despotism. The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_european_history diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml new file mode 100644 index 00000000..42f6c040 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml @@ -0,0 +1,63 @@ +dataset_name: high_school_geography +description: 'The following are multiple choice questions (with answers) about high
  school geography.


  Q: Which one of the following items is an example of nonmaterial culture?

  (A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).

  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
  Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not
  physical objects. The answer is (C).


  Q: During the third stage of the demographic transition model, which of the following
  is true?

  (A) Birth rates increase and population growth rate is less rapid. (B) Birth rates
  decline and population growth rate is less rapid. (C) Birth rates increase and population
  growth rate increases. (D) Birth rates decrease and population growth rate increases.

  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
The demographic transition model models the five different stages of population
  growth as a country goes through economic development, where the third stage refers
  to a period of declining birth rates and lower population growth. The answer is
  (B).


  Q: The practice of hiring a foreign third-party service provider to run an operation
  is called

  (A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.

  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
  "Offshoring" literally means to move or base some of the activities or processes
  of a company to a foreign country. The answer is (B).


  Q: Which of the following statements is NOT accurate regarding the services provided
  by local governments in the United States?

  (A) Duplication of efforts occurs often. (B) Social problems of the central city
  spill over into the surrounding residential suburbs. (C) Inefficiency in providing
  services occurs often. (D) One neighborhood''s efforts to reduce pollution are always
  supported by neighboring communities.

  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
  There may be economic, social or political reasons for two neighboring communities
  and their local governments not agreeing to pollution reduction efforts initiated
  by one of them. The answer is (D).


  Q: The rate of natural increase of a population is found by subtracting the

  (A) crude death rate from the crude birth rate. (B) crude birth rate from the crude
  death rate. (C) doubling time from the crude birth rate. (D) fertility rate from
  the crude death rate.

  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
  The difference between the number of births and deaths gives the population increase
  at any given time. The answer is (A).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_geography diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 00000000..8ec1c5b0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,67 @@ +dataset_name: high_school_government_and_politics +description: 'The following are multiple choice questions (with answers) about high
  school government and politics.


  Q: Which of the following best states an argument made by James Madison in The Federalist
  number 10?

  (A) Honest politicians can prevent factions from developing. (B) Factions are more
  likely to occur in large republics than in small ones. (C) The negative effects
  of factionalism can be reduced by a republican government. (D) Free elections are
  the people''s best defense against factionalism.

  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
  for help. In the Federalist number 10, James Madison advocated for a representative
  republican form of government to guard against factionalism. The answer is (C).


  Q: The term "budget deficit" refers to the

  (A) annual increase in federal spending on the military (B) amount of interest on
  the national debt (C) difference between the initial budget proposals made by the
  president and Congress (D) amount the government spends in excess of its revenues

  A: Let''s think step by step.
We refer to Wikipedia articles on government and politics
  for help. When the government spends more than it earns, the difference is the
  budget deficit. The answer is (D).


  Q: Which of the following statements about cabinet departments is FALSE?

  (A) They are established by the legislative branch. (B) Their members often don''t
  have much influence over presidential decisions. (C) They cannot all be run by leaders
  who belong to the same political party the president does. (D) Not every federal
  agency is a cabinet department.

  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
  for help. There is no law stipulating that some cabinet department leaders have
  to belong to a political party different from that of the president. The answer
  is (C).


  Q: Which of the following cases established the precedent that a defendant must
  be informed of the right to remain silent, the right to a lawyer, and protection
  from self-incrimination?

  (A) Weeks v. United States (B) Betts v. Brady (C) Mapp v. Ohio (D) Miranda v. Arizona

  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
  for help. In the landmark Miranda v. Arizona in 1966, the US Supreme Court, based
  on the Fifth and Sixth Amendment of the US Constitution, guaranteed a defendant''s
  right to an attorney and protection from self-incrimination. The answer is (D).


  Q: Uncertainty over the limits to presidential power is caused primarily by the
  fact that

  (A) the constitutional definition of those powers is broad and unspecific (B) most
  people agree that the Constitution places too many limits on presidential power
  (C) the Supreme Court consistently refuses to rule on cases concerning presidential
  powers (D) constitutional amendments have greatly increased presidential powers

  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
  for help. The US Constitution is not very specific about the powers of the president,
  leading to uncertainty over its limits. The answer is (A).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_government_and_politics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 00000000..f47a83e6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,64 @@ +dataset_name: high_school_macroeconomics +description: 'The following are multiple choice questions (with answers) about high
  school macroeconomics.


  Q: Which of the following policies best describes supply-side fiscal policy?

  (A) An increase in the money supply (B) Increased government spending (C) Lower
  taxes on research and development of new technology (D) Higher taxes on household
  income

  A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for
  help. Supply-side fiscal policy stimulates the economy by encouraging more production
  of goods and services through reduction in taxes and deregulation. The answer is
  (C).


  Q: The short-run Phillips curve indicates a

  (A) direct relation between unemployment and inflation (B) direct relation between
  price and quantity demanded (C) inverse relation between price and quantity demanded
  (D) inverse relation between unemployment and inflation

  A: Let''s think step by step.
We refer to Wikipedia articles on macroeconomics for
  help. The short-run Phillips curve shows that whenever unemployment decreases below
  its natural level, inflation starts increasing, and vice-versa. The answer is
  (D).


  Q: Holding all else equal which of the following monetary policies would be used
  to boost U.S. exports?

  (A) Increasing the discount rate (B) Increasing the reserve ratio (C) Buying government
  securities (D) Lowering tariffs

  A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for
  help. Buying government securities leads to a reduction in demand for US dollars from
  foreign buyers, thereby making the dollar cheaper and hence making US exports more
  attractive. The answer is (C).


  Q: A federal deficit occurs when

  (A) exports exceed imports. (B) imports exceed exports. (C) federal tax collections
  exceed spending. (D) federal spending exceeds federal tax revenues.

  A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for
  help. A federal deficit occurs when federal spending exceeds federal income which
  is primarily from tax revenues. The answer is (D).


  Q: Which of the following is not included in the U.S. GDP?

  (A) The U.S. military opens a new base in a foreign country with 1000 U.S. personnel.
  (B) Japanese consumers buy thousands of CDs produced in the United States. (C) An
  American pop singer performs a sold-out concert in Paris. (D) A French theatrical
  production tours dozens of American cities.

  A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for
  help. The economic transactions related to the performance of the American pop-singer
  in Paris happen entirely outside the U.S. and hence are not included in the GDP
  numbers. The answer is (C).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_macroeconomics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml new file mode 100644 index 00000000..eb692a09 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml @@ -0,0 +1,36 @@ +dataset_name: high_school_mathematics +description: "The following are multiple choice questions (with answers) about high\
 \ school mathematics.\n\nQ: Simplify and write the result with a rational denominator:\
 \ $$\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}$$\n(A) \\frac{3\\sqrt{3}}{3} (B) \\\
 frac{1}{3} (C) \\sqrt{3} (D) \\frac{\\sqrt{3}}{3}\nA: Let's think step by step.\
 \ Factoring $729=3^6$ and combining the roots $\\frac{1}{2}\\frac{1}{3}\\frac{1}{2}=\\\
 frac{1}{12}$, we get that $\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}=\\left(\\frac{1}{3^6}\\\
 right)^{\\frac{1}{12}}=\\frac{1}{3^{\\frac{1}{2}}}=\\frac{\\sqrt{3}}{3}$ The answer\
 \ is (D).\n\nQ: Five thousand dollars compounded annually at an $x\\%$ interest\
 \ rate takes six years to double. At the same interest rate, how many years will\
 \ it take $\\$300$ to grow to $\\$9600$?\n(A) 12 (B) 1 (C) 30 (D) 5\nA: Let's think\
 \ step by step. To go from $\\$300$ to $\\$9600$, the value must go up by a factor\
 \ of $9600/300=32=2^5$. Since at this interest rate it takes six years for it to\
 \ double, it will take $5*6=30$ years to grow to $\\$9600$. The answer is (C).\n\
 \nQ: Ten students take a biology test and receive the following scores: 45, 55,\
 \ 50, 70, 65, 80, 40, 90, 70, 85.
What is the mean of the students\u2019 test scores?\n\
 (A) 55 (B) 60 (C) 62 (D) 65\nA: Let's think step by step. There are 10 students\
 \ and the sum of their scores is $45 + 55 + 50 + 70 + 65 + 80 + 40 + 90 + 70 + 85\
 \ = 650$, the mean is $650/10=65$. The answer is (D).\n\nQ: The variable $x$ varies\
 \ directly as the square of $y$, and $y$ varies directly as the cube of $z$. If\
 \ $x$ equals $-16$ when $z$ equals 2, what is the value of $x$ when $z$ equals $\\\
 frac{1}{2}$?\n(A) -1 (B) 16 (C) -\\frac{1}{256} (D) \\frac{1}{16}\nA: Let's think\
 \ step by step. We know that $x \\propto y^2$ and $y \\propto z^3$, so $x = k z^6$\
 \ for some constant $k$. Plugging in for $x=-16$ and $z=2$, the constant value is\
 \ $k=\\frac{x}{z^6}=\\frac{-16}{64}=-\\frac{1}{4}$. So, when $z=\\frac{1}{2}$, the\
 \ value of $x$ is $x=kz^6=-\\frac{1}{4}\\frac{1}{2^6}=-\\frac{1}{256}$. The answer\
 \ is (C).\n\nQ: Joe was in charge of lights for a dance. The red light blinks every\
 \ two seconds, the yellow light every three seconds, and the blue light every five\
 \ seconds. If we include the very beginning and very end of the dance, how many\
 \ times during a seven minute dance will all the lights come on at the same time?\
 \ (Assume that all three lights blink simultaneously at the very beginning of the\
 \ dance.)\n(A) 3 (B) 15 (C) 6 (D) 5\nA: Let's think step by step. The least common\
 \ multiple of 2, 3 and 5 is 30, so all three lights come on together every 30 seconds,\
 \ i.e. twice per minute; counting the very beginning, during a 7 minute dance they\
 \ will come on at the same time $2*7+1=15$ times. The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_mathematics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml new file mode 100644 index 00000000..86c83c82 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,63 @@ +dataset_name: high_school_microeconomics +description: 'The following are multiple choice questions (with answers) about high
  school microeconomics.


  Q: Which of the following is necessarily a characteristic of oligopoly?

  (A) Free entry into and exit from the market (B) A few large producers (C) One producer
  of a good with no close substitutes (D) A homogenous product

  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
  help. An oligopoly is a market dominated by a few large sellers or producers. An
  oligopoly market has high barriers to new entry, and its products are typically
  differentiated. The answer is (B).


  Q: If the government subsidizes producers in a perfectly competitive market, then

  (A) the demand for the product will increase (B) the demand for the product will
  decrease (C) the consumer surplus will increase (D) the consumer surplus will decrease

  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
  help. (A) and (B) are wrong because the demand curve does not change at all. If
  the government subsidizes producers, the supply will increase, and thus the consumer
  surplus also increases. The answer is (C).


  Q: Which of the following is true of a price floor?

  (A) The price floor shifts the demand curve to the left. (B) An effective floor
  creates a shortage of the good. (C) The price floor shifts the supply curve of the
  good to the right.
(D) To be an effective floor, it must be set above the equilibrium
  price.

  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
  help. A price floor does not shift the demand or supply curve. An effective price
  floor must be set above the equilibrium price; otherwise the market clears at the
  equilibrium price and the floor has no effect. The answer is (D).


  Q: The concentration ratio for a monopoly is

  (A) 0 (B) 5 (C) 10 (D) 100

  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
  help. The concentration ratio is calculated as the sum of market share of a specific
  number of largest companies. Monopoly means one company or entity controls the entire
  market, therefore, the concentration ratio is 100 percent. The answer is (D).


  Q: In a competitive labor market for housepainters, which of the following would
  increase the demand for housepainters?

  (A) An effective minimum wage imposed on this labor market. (B) An increase in the
  price of gallons of paint. (C) An increase in the construction of new houses. (D)
  An increase in the price of mechanical painters so long as the output effect exceeds
  the substitution effect.

  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
  help. An increase in the construction of new houses means an increased demand for
  house painting, which increases the demand for housepainters. The answer is (C).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_microeconomics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml new file mode 100644 index 00000000..f21a183c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml @@ -0,0 +1,38 @@ +dataset_name: high_school_physics +description: "The following are multiple choice questions (with answers) about high\
 \ school physics.\n\nQ: A microwave oven is connected to an outlet, 120 V, and draws\
 \ a current of 2 amps. At what rate is energy being used by the microwave oven?\n\
 (A) 10 W (B) 30 W (C) 60 W (D) 240 W\nA: Let's think step by step. Rate of energy\
 \ usage is known as power; in a dissipative electrical circuit, power is given\
 \ by voltage times current. So in our case, the power is 120 V times 2 amps, or\
 \ 240 W. The answer is (D).\n\nQ: A point charge, Q = +1 mC, is fixed at the origin.\
 \ How much work is required to move a charge, Q = +8 \xB5C, from the point (0, 4\
 \ meters) to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\n\
 A: Let's think step by step. To calculate the work required to move a charge from\
 \ one location to another in a fixed electric field, it is enough to calculate the\
 \ potential difference between the two locations. Here, the potential only depends\
 \ on the distance between the charges; it\u2019s $k q_1 q_2 / r$, where $k$ is Coulomb\u2019\
 s constant. Plugging in values $q_1 = 1$ mC and $q_2 = 8 \\mu$C, and taking the difference\
 \ between $r = 3$ m and $r = 4$ m, gives the answer as 5.992 J, which rounds to 6 J.\
 \ The answer is (B).\n\nQ: Which of the following\
 \ conditions will ensure that angular momentum is conserved? I. Conservation of\
 \ linear momentum II. Zero net external force III. Zero net external torque\n(A)\
 \ I and II only (B) I and III only (C) II and III only (D) III only\nA: Let's think\
 \ step by step.
Torque is the rate of change of angular momentum; if there is\
 \ zero external torque, angular momentum is conserved. The answer is (D).\n\nQ:\
 \ A photocell of work function \u03D5 = 2eV is connected to a resistor in series.\
 \ Light of frequency f = 1 \xD7 10^15 Hz hits a metal plate of the photocell. If\
 \ the power of the light is P = 100 W, what is the current through the resistor?\n\
 (A) 2:00 AM (B) 6:00 AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The\
 \ only answer above which has units of current is D, 24 A. The answer is (D).\n\n\
 Q: A pipe full of air is closed at one end. A standing wave is produced in the pipe,\
 \ causing the pipe to sound a note. Which of the following is a correct statement\
 \ about the wave\u2019s properties at the closed end of the pipe?\n(A) The pressure\
 \ is at a node, but the particle displacement is at an antinode. (B) The pressure\
 \ is at an antinode, but the particle displacement is at a node. (C) The pressure\
 \ and the particle displacement are both at nodes. (D) The pressure and the particle\
 \ displacement are both at antinodes.\nA: Let's think step by step. At the closed\
 \ end of the pipe, the particles cannot have any net displacement because the pipe\
 \ closure stops them. So the particle displacement is at a node. This closure also\
 \ causes the pressure to be maximal, i.e. an antinode. The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_physics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml new file mode 100644 index 00000000..706db0ec --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml @@ -0,0 +1,72 @@ +dataset_name: high_school_psychology +description: 'The following are multiple choice questions (with answers) about high
  school psychology.


  Q: Pascale is interested in the processing strategies children use to learn new
  information. Pascale would best be classified as what type of psychologist?

  (A) sociocultural (B) clinical (C) cognitive (D) behaviorist

  A: Let''s think step by step. We refer to Wikipedia articles on psychology for help.
  A sociocultural psychologist focuses on the effect of societal factors on people.
  A clinical psychologist focuses on people with mental issues. A cognitive psychologist
  focuses on how people think and learn, including the processing strategies. A behaviorist
  focuses more on the effects of environment and experience on people. The answer is (C).


  Q: According to Caplan''s model of consultee-centered case consultation, the consultant
  is primarily interested in

  (A) identifying the causes and solutions of the client''s presenting problems (B)
  identifying and eliminating the causes of the consultee''s difficulties in handling
  a problem (C) establishing a hierarchy of authority to enable effective decision
  making (D) presenting a single, well-defined and unambiguous course of action for
  the consultant to overcome skills deficits

  A: Let''s think step by step. We refer to Wikipedia articles on psychology for help.
  Caplan defines two types of consultation. Client-centered case consultation aims
  to handle the client''s problems, while consultee-centered case consultation aims to
  identify the causes of the consultee''s difficulties in handling a problem. The answer is (B).
Q: According to the Individuals with Disabilities Education Improvement Act, which
  of the following must an educational agency do before it changes the educational
  placement of a student with a disability?

  (A) Give the child a trial period in the new environment (B) Notify the parents
  in writing (C) Obtain school board approval (D) Obtain parental consent

  A: Let''s think step by step. We refer to Wikipedia articles on psychology for help.
  When the decision to change the educational placement of a student with a disability
  is made, the educational agency must notify the parents in writing on that date.
  The answer is (B).


  Q: While swimming in the ocean, Ivan is frightened by a dark shadow in the water
  even before he has the chance to identify what the shadow is. The synaptic connections
  taking place during this incident of fright are best described by which of the following?

  (A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are
  sent from the thalamus to the "what" and "where" pathways. (C) Messages are sent
  from the parasympathetic nervous system to the cerebral cortex. (D) Messages are
  sent from the frontal lobes to the pituitary gland.

  A: Let''s think step by step. We refer to Wikipedia articles on psychology for help.
  Our neural system has a mechanism that can respond to immediate emotional signals
  before they reach the thought centers of the brain. In Ivan''s case, messages travel
  directly from the thalamus to the amygdala. The answer is (A).


  Q: Ani believes that her attitudes and behavior play a central role in what happens
  to her. Such a belief is likely to be associated with

  (A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an internal
  locus of control.

  A: Let''s think step by step. We refer to Wikipedia articles on psychology for help.
  People with an external locus of control believe fate and luck play an important
  role in their lives, while people with an internal locus of control believe they
  control their lives. The answer is (D).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_psychology diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml new file mode 100644 index 00000000..37e21061 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml @@ -0,0 +1,88 @@ +dataset_name: high_school_statistics +description: 'The following are multiple choice questions (with answers) about high
  school statistics.


  Q: A new smartwatch is manufactured in one part of a factory, then secured for shipping
  in another, independent part of the factory. The weight of the smartwatch has a
  mean of 62 grams and a standard deviation of 1.0 grams. The weight of the packaging
  (box, user''s guide, bubble wrap, etc.) has a mean of 456 grams and a standard deviation
  of 6 grams. Together, the distribution of the weight of the smartwatch and its packaging
  would have the following mean and standard deviation:

  (A) Mean 518 grams; standard deviation 7.0 grams (B) Mean 518 grams; standard deviation
  3.5 grams (C) Mean 518 grams; standard deviation 6.1 grams (D) Mean 394 grams; standard
  deviation 6.1 grams

  A: Let''s think step by step.
Since the weight of the watch and the weight of the
  packaging are independent random variables, the mean and variance of their sum are
  equal to the sum of their individual means and variances. So the mean is 62 + 456
  = 518 grams, and the variance is 1.0^2 + 6.0^2 = 37, leading to a standard deviation
  of 6.1 grams. The answer is (C).


  Q: After a frost warning was issued, the owner of a large orange grove asked his
  workers to spray all his trees with water. The water was supposed to freeze and
  form a protective covering of ice around the orange blossom. Nevertheless, the owner
  suspected that some trees suffered considerable damage due to the frost. To estimate
  the proportion of trees that suffered more than 50 percent damage due to the frost,
  he took a random sample of 100 trees from his grove. What is the response variable
  in this experiment?

  (A) The proportion of trees that suffered more than 50 percent damage due to frost.
  (B) The number of trees affected by the frost. (C) The number of trees sampled from
  the grove. (D) For each sampled tree, whether it suffered more than 50 percent damage
  or at most 50 percent damage.

  A: Let''s think step by step. In this experiment, the response variable is what
  is measured. For each tree, what is measured is whether or not it suffered more
  than 50 percent damage due to the frost. The answer is (D).


  Q: Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and
  var(Y) = 12. What are the expected value and variance of the random variable X +
  Y?

  (A) E(X + Y) = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X +
  Y) = 99, var(X + Y) = 17 (D) There is insufficient information to answer this question.

  A: Let''s think step by step. While the means of sums of random variables add (regardless
  of whether the variables are independent), in order to determine the variance of
  a sum of random variables we need to know not just their individual variances but
  the covariance of the two variables, which is not given in this problem. The answer
  is (D).


  Q: Which of the following sets has the smallest standard deviation? Which has the
  largest?

  I: {1,2,3}

  II: {-10,10}

  III: {100}

  (A) I, II (B) II, III (C) III, I (D) III, II

  A: Let''s think step by step. The variance of distribution I is the expected squared
  deviation from its mean (which is 2), so the variance is 2/3. The variance of distribution
  II is 10^2 (because both elements are 10 away from the mean of zero). The variance
  of distribution III is 0, since it has a single entry. So distribution III has the
  smallest standard deviation and distribution II has the largest. The answer is (D).


  Q: Which of the following is a correct statement about correlation?

  (A) If the slope of the regression line is exactly 1, then the correlation is exactly
  1. (B) If the correlation is 0, then the slope of the regression line is undefined.
  (C) Switching which variable is called x and which is called y changes the sign
  of the correlation. (D) The correlation r is equal to the slope of the regression
  line when z-scores for the y-variable are plotted against z-scores for the x-variable.

  A: Let''s think step by step. Statement A is false because the slope of the regression
  line being exactly 1 can occur even when the two variables are not perfectly correlated.
  Statement B is false because the regression line for uncorrelated variables has slope
  zero, not an undefined slope.
Statement C is false because correlation is symmetric in the two random variables. + The answer is (D).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_statistics diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml new file mode 100644 index 00000000..951666d1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml @@ -0,0 +1,133 @@ +dataset_name: high_school_us_history +description: "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\nQ: This question refers to the following information.\n\ + I come not to urge personal claims, nor to seek individual benefits; I appear as\ + \ the advocate of those who cannot plead their own cause; I come as the friend of\ + \ those who are deserted, oppressed, and desolate. In the Providence of God, I am\ + \ the voice of the maniac whose piercing cries from the dreary dungeons of your\ + \ jails penetrate not your Halls of Legislation. I am the Hope of the poor crazed\ + \ beings who pine in the cells, and stalls, and cages, and waste rooms of your poor-houses.\ + \ I am the Revelation of hundreds of wailing, suffering creatures, hidden in your\ + \ private dwellings, and in pens and cabins\u2014shut out, cut off from all healing\ + \ influences, from all mind-restoring cares.\u2026 Could their melancholy histories\ + \ be spread before you as revealed to my grieved spirit during the last three months,\ + \ how promptly, how earnestly would you search out the most approved means of relief;\ + \ how trifling, how insignificant, by comparison, would appear the sacrifices you\ + \ are asked to make; how would a few dimes and dollars, gathered from each citizen,\ + \ diminish in value as a possession, compared with the certain benefits and vast\ + \ good to be secured for the suffering insane...by the consecration and application\ + \ of a sufficient fund to the construction of a suitable hospital.\u2026\n\u2014\ + Dorothea Dix, Memorial Soliciting a State Hospital for the Protection and Cure of\ + \ the Insane,\nSubmitted to the General Assembly of North Carolina, November 1848\n\ + Dorothea Dix can best be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C)\ + \ Shirley Temple (D) Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on us history for help. Both Dorothea Dix and Clara barton are American\ + \ nurses. The answer is (B).\n\nQ: This question refers to the following information.\n\ + \"As our late Conduct at the Conestoga Manor and Lancaster have occasioned much\ + \ Speculation & a great diversity of Sentiments in this and neighboring Governments;\ + \ some vindicating & others condemning it; some charitably alleviating the Crime,\ + \ & others maliciously painting it in the most odious & detestable Colours, we think\ + \ it our duty to lay before the Publick, the whole Matter as it appeared, & still\ + \ appears, to us. . . .\n\"If these things are not sufficient to prove an unjustifiable\ + \ Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend\ + \ them & an utter insensibility to human Distresses, let us consider a few more\ + \ recent Facts. 
When we found the last Summer that we were likely to get no Assistance\
+ \ from the Government, some Volunteers went out at our own Expense, determined to\
+ \ drive our Enemies from our Borders; & when we came near to the great Island, we\
+ \ understood that a Number of their Warriors had gone out against our Frontiers.\
+ \ Upon this we returned and came up with them and fought with them at the Munfey\
+ \ Hill where we lost some of our Men & killed some of their Warriors & thereby saved\
+ \ our Frontiers from this Story in another Expedition. But no sooner had we destroyed\
+ \ their Provisions on the great Island, & ruined their trade with the good People\
+ \ at Bethlehem, but these very Indians, who were justly suspected of having murdered\
+ \ our Friends in Northampton County, were by the Influence of some Quakers taken\
+ \ under the Protection of the Government to screen them from the Resentments of\
+ \ the Friends and Relations of the Murdered, & to support them thro the Winter.\"\
+ \n\u2014\"Apology of the Paxton Boys\" (pamphlet), 1764 (Note: \"apology\" in this\
+ \ context should be read as an explanation, not an admission of guilt or regret.)\n\
+ The sentiments expressed in the explanation above reflect which of the ongoing tensions\
+ \ during the colonial period of American history?\n(A) Tensions between British\
+ \ policies and the aspirations of North American colonists. (B) Tensions between\
+ \ American Indians allied with the French and those allied with the British. (C)\
+ \ Tensions between freed African Americans and white planters. (D) Tensions between\
+ \ backcountry settlers and elites within colonial America.\nA: Let's think step\
+ \ by step. We refer to Wikipedia articles on us history for help. After the French\
+ \ and Indian War, the Scotch-Irish settlers attacked American Indians. After the\
+ \ attacks on the Conestoga, about 250 Paxton Boys presented their grievances to the\
+ \ Pennsylvania legislature. As mentioned in the information, the Paxton Boys cited\
+ \ resentment at local elites. The answer is (D).\n\nQ: This question refers to\
+ \ the following information.\nOur leaders talk about stopping aggression from the\
+ \ north, but this was a struggle among groups of Vietnamese until we intervened.\
+ \ We seem bent upon saving the Vietnamese from Ho Chi Minh even if we have to kill\
+ \ them and demolish their country to do it. As the native people survey bombed-out\
+ \ villages, women and children burned by napalm, rice crops destroyed and cities\
+ \ overrun with our military personnel, they are doubtless saying secretly of the\
+ \ Vietcong guerillas and of the American forces, \"A plague on both your houses.\"\
+ \ \u2026 Stop the bombing, north and south, end search and destroy offensive sweeps,\
+ \ and confine our military action to holding operations on the ground. Bombing the\
+ \ north has failed to halt or seriously check the flow of troops to the south and\
+ \ may, in fact, have prompted a much greater war effort by Hanoi.\n\u2014Senator\
+ \ George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\nWhich of the following\
+ \ opinions from the 1960s most directly reflects the perspective of George McGovern's\
+ \ speech?\n(A) Americans must maximize their technological edge in Vietnam. (B)\
+ \ American bombing in Vietnam is step by step leading to progress in the war. (C)\
+ \ American bombing in Vietnam is a failure. (D) America must not give in to defeatism\
+ \ about the war in Vietnam.\nA: Let's think step by step. 
We refer to Wikipedia\
+ \ articles on us history for help. \"Stop the bombing\" and \"Bombing the north\
+ \ has failed to halt or seriously check the flow of troops to the south\" indicate\
+ \ that the perspective of George McGovern's speech is that American bombing in Vietnam\
+ \ is a failure. The answer is (C).\n\nQ: This question refers to the following information.\n\
+ \"In the new Code of Laws which I suppose it will be necessary for you to make I\
+ \ desire you would Remember the Ladies, and be more generous and favorable to them\
+ \ than your ancestors. Do not put such unlimited power into the hands of the Husbands.\
+ \ Remember all Men would be tyrants if they could. If particular care and attention\
+ \ is not paid to the Ladies we are determined to foment a Rebellion, and will not\
+ \ hold ourselves bound by any Laws in which we have no voice, or Representation.\"\
+ \nAbigail Adams, in a letter to John Adams, 1776\n\"Special legislation for woman\
+ \ has placed us in a most anomalous position. Women invested with the rights of\
+ \ citizens in one section\u2014voters, jurors, office-holders\u2014crossing an imaginary\
+ \ line, are subjects in the next. In some States, a married woman may hold property\
+ \ and transact business in her own name; in others, her earnings belong to her husband.\
+ \ In some States, a woman may testify against her husband, sue and be sued in the\
+ \ courts; in others, she has no redress in case of damage to person, property, or\
+ \ character. In case of divorce on account of adultery in the husband, the innocent\
+ \ wife is held to possess no right to children or property, unless by special decree\
+ \ of the court. But in no State of the Union has the wife the right to her own person,\
+ \ or to any part of the joint earnings of the co-partnership during the life of\
+ \ her husband. In some States women may enter the law schools and practice in the\
+ \ courts; in others they are forbidden. In some universities girls enjoy equal educational\
+ \ advantages with boys, while many of the proudest institutions in the land deny\
+ \ them admittance, though the sons of China, Japan and Africa are welcomed there.\
+ \ But the privileges already granted in the several States are by no means secure.\"\
+ \nSusan B. Anthony, \"Declaration of Rights for Women,\" July 4, 1876\nThe sentiments\
+ \ expressed in the second excerpt by Susan B. Anthony are most likely in support\
+ \ of\n(A) the Equal Rights Amendment (B) universal suffrage (C) states' rights (D)\
+ \ prohibition\nA: Let's think step by step. We refer to Wikipedia articles on us\
+ \ history for help. The above information mentioned that women are in an anomalous\
+ \ position in terms of legislation. Women's earnings do not belong to themselves,\
+ \ or they cannot testify against their husbands. Susan believes women should have\
+ \ equal legal rights as men. The answer is (B).\n\nQ: This question refers to the\
+ \ following information.\n\"Society in every state is a blessing, but government\
+ \ even in its best state is but a necessary evil; in its worst state an intolerable\
+ \ one; for when we suffer, or are exposed to the same miseries by a government,\
+ \ which we might expect in a country without government, our calamity is heightened\
+ \ by reflecting that we furnish the means by which we suffer. Government, like dress,\
+ \ is the badge of lost innocence; the palaces of kings are built on the ruins of\
+ \ the bowers of paradise. 
For were the impulses of conscience clear, uniform, and\
+ \ irresistibly obeyed, man would need no other lawgiver; but that not being the\
+ \ case, he finds it necessary to surrender up a part of his property to furnish\
+ \ means for the protection of the rest; and this he is induced to do by the same\
+ \ prudence which in every other case advises him out of two evils to choose the\
+ \ least. Wherefore, security being the true design and end of government, it unanswerably\
+ \ follows that whatever form thereof appears most likely to ensure it to us, with\
+ \ the least expense and greatest benefit, is preferable to all others.\"\nThomas\
+ \ Paine, Common Sense, 1776\nWhich of the following \"miseries\" alluded to above\
+ \ were most condemned by Anti-Federalists of the post-Revolutionary era?\n(A) Organized\
+ \ response to Bacon's Rebellion (B) Federal response to Shays's Rebellion (C) Federal\
+ \ response to the Whiskey Rebellion (D) Federal response to Pontiac's Rebellion\n\
+ A: Let's think step by step. We refer to Wikipedia articles on us history for help.\
+ \ Anti-Federalists did not believe in centralized government power, and were suspicious\
+ \ of Washington's military response to the Whiskey Rebellion. Bacon's Rebellion and\
+ \ Pontiac's Rebellion happened before the Revolution, so they can be ruled out. The\
+ \ answer is (C)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_high_school_us_history
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml
new file mode 100644
index 00000000..1cf4bbdb
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml
@@ -0,0 +1,82 @@
+dataset_name: high_school_world_history
+description: "The following are multiple choice questions (with answers) about high\
+ \ school world history.\n\nQ: This question refers to the following information.\n\
+ \"At least one of the [world's] societies would have to somehow enormously increase\
+ \ its productivity [in order to achieve global hegemony]. That quantum jump would\
+ \ have to be made before the various scientific, technological, agricultural, and\
+ \ industrial revolutions on which our post-quantum-leap world rests. It could only\
+ \ be accomplished by exploiting the ecosystems, mineral resources, and human assets\
+ \ of whole continents outside the lands of the society making the jump. Western\
+ \ Europe did just that by means of its brutality and guns and, more important, by\
+ \ geographical and ecological luck.\"\nCopyright \xA9 2015 Cambridge University\
+ \ Press.\nAlfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum\
+ \ jump\" mentioned in the passage most directly contributed to which of the following\
+ \ developments in the period 1450\u20131750 C.E.?\n(A) A breakdown in trade routes\
+ \ through the collapse of the established state structure (B) An increase in the\
+ \ population of the world through more plentiful supplies of food (C) The spread\
+ \ of Chinese and Indian belief systems across the world (D) An increase in social\
+ \ unrest\nA: Let's think step by step. We refer to Wikipedia articles on world history\
+ \ for help. The \"quantum jump\" mentioned in the passage refers to the conquest\
+ \ of the New World and the Columbian Exchange. Choices (A) and (C) did not happen\
+ \ in history. Choice (C) refers to the human assets. 
The answer is (B).\n\nQ: This\
+ \ question refers to the following information.\n\"The struggle against neo-colonialism\
+ \ is not aimed at excluding the capital of the developed world from operating in\
+ \ less developed countries. It is aimed at preventing the financial power of the\
+ \ developed countries being used in such a way as to impoverish the less developed.\n\
+ Non-alignment, as practiced by Ghana and many other countries, is based on co-operation\
+ \ with all States whether they be capitalist, socialist or have a mixed economy.\
+ \ Such a policy, therefore, involves foreign investment from capitalist countries,\
+ \ but it must be invested in accordance with a national plan drawn up by the government\
+ \ of the non-aligned State with its own interests in mind. The issue is not what\
+ \ return the foreign investor receives on his investments\u2026The question is one\
+ \ of power. A State in the grip of neo-colonialism is not master of its own destiny.\"\
+ \nKwame Nkrumah, Neo-Colonialism, 1965\nWhich of the following provides the best\
+ \ context for Nkrumah's writings?\n(A) The Industrial Revolution (B) Decolonization\
+ \ (C) Regional Free Trade Associations (D) Autarky\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on world history for help. The passage expresses\
+ \ the concern that the fight against neo-colonialism was in danger, and that newly\
+ \ independent nations like Ghana might be re-colonized via the financial power of\
+ \ the developed countries. The answer is (B).\n\nQ: This question refers to the\
+ \ following information.\n\"Indeed, as both the fatwas of distinguished [scholars]\
+ \ who base their opinion on reason and tradition alike and the consensus of the\
+ \ Sunni community agree that the ancient obligation of extirpation, extermination,\
+ \ and expulsion of evil innovation must be the aim of our exalted aspiration, for\
+ \ \"Religious zeal is a victory for the Faith of God the Beneficent\"; then, in\
+ \ accordance with the words of the Prophet (Peace upon him!) \"Whosoever introduces\
+ \ evil innovation into our order must be expelled\" and \"Whosoever does aught against\
+ \ our order must be expelled,\" action has become necessary and exigent\u2026\"\n\
+ Letter from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514\nThe letter from\
+ \ Selim I is most clearly an example of which of the following?\n(A) The maintenance\
+ \ of military supremacy at all costs (B) Expanding tensions between religious sects\
+ \ (C) Factors that brought about the collapse of the Ottoman Empire (D) Peacemaking\
+ \ efforts among the Islamic empires\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on world history for help. The passage is an example of expanding tensions\
+ \ between Selim and Ismail. In the passage, Selim references the fatwas and the\
+ \ consensus of the Sunni community to act against whosoever introduces evil innovation.\
+ \ The answer is (B).\n\nQ: This question refers to the following information.\n\"The real grievance\
+ \ of the worker is the insecurity of his existence; he is not sure that he will\
+ \ always have work, he is not sure that he will always be healthy, and he foresees\
+ \ that he will one day be old and unfit to work. 
If he falls into poverty, even\
+ \ if only through a prolonged illness, he is then completely helpless, left\
+ \ to his own devices, and society does not currently recognize any real obligation\
+ \ towards him beyond the usual help for the poor, even if he has been working all\
+ \ the time ever so faithfully and diligently. The usual help for the poor, however,\
+ \ leaves a lot to be desired, especially in large cities, where it is very much\
+ \ worse than in the country.\"\nOtto von Bismarck, 1884\nOtto von Bismarck likely\
+ \ made this speech in reaction to which of the following issues?\n(A) Social acceptance\
+ \ of child labor (B) Declining life expectancy in Germany (C) Criticisms of German\
+ \ trade tariffs (D) Negative effects attributed to industrial capitalism\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on world history for help.\
+ \ The passage talks about the grievances of the worker under industrial capitalism.\
+ \ The answer is (D).\n\nQ: This question refers to the following information.\n\
+ He contains all works and desires and all perfumes and all tastes. He enfolds the\
+ \ whole universe and in silence is loving to all. This is the Spirit that is in\
+ \ my heart, this is Brahman. To him I shall come when I go beyond this life, and\
+ \ to him will come he who has faith and doubts not.\n\u2014The Upanishads, India,\
+ \ c. 1000 BCE\nTo which religion does the speaker most likely belong?\n(A) Hinduism\
+ \ (B) Buddhism (C) Shintoism (D) Zoroastrianism\nA: Let's think step by step. We\
+ \ refer to Wikipedia articles on world history for help. Brahman refers to the ultimate\
+ \ reality of all things in the Hindu religion. In contrast, Buddhism does not have\
+ \ a concept of supreme God. The answer is (A)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_high_school_world_history
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml
new file mode 100644
index 00000000..9d652132
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml
@@ -0,0 +1,48 @@
+dataset_name: human_aging
+description: 'The following are multiple choice questions (with answers) about human
+ aging.
+
+
+ Q: All other things being equal, which of the following persons is more likely to
+ show osteoporosis?
+
+ (A) An older Hispanic American woman (B) An older African American woman (C) An
+ older Asian American woman (D) An older Native American woman
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human aging for
+ help. Although osteoporosis can occur at any age, the risk is higher for older people.
+ It is most common in Asian and non-Hispanic white women. The answer is (C).
+
+
+ Q: The finding that adults tend to remember events from their adolescence better
+ than from other periods in their lives is referred to as the
+
+ (A) Adolescence advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic
+ retrieval spike
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human aging for
+ help. The reminiscence bump is a phenomenon in which older adults tend to recollect
+ events from their younger years. People usually have a period of childhood amnesia from
+ birth to around age 5, and a reminiscence bump between 10 and 30. The answer is
+ (B).
+
+
+ Q: Which element in tobacco smoke is responsible for cancers? 
+
+ (A) Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human aging for
+ help. The benzene, acrylamide, and acrylonitrile in tar interact with the lungs and
+ cause DNA mutations in lung cells, leading to cancer. The answer is (B).
+
+
+ Q: When older adults move to a new state after retirement, which of the following
+ is the more likely destination?
+
+ (A) Texas (B) California (C) Hawaii (D) Vermont
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human aging for
+ help. Texas does not have a state income tax, and has a low cost of living compared
+ with the other three options. The answer is (A).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_human_aging
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml
new file mode 100644
index 00000000..6b7a12cc
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml
@@ -0,0 +1,61 @@
+dataset_name: human_sexuality
+description: 'The following are multiple choice questions (with answers) about human
+ sexuality.
+
+
+ Q: The most common disorder among men who seek sexual therapy is:
+
+ (A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory
+ disorder
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
+ for help. The most common type of arousal disorder for men is erectile dysfunction,
+ meaning that a person is not able to get any physical satisfaction from sexual activity
+ although he may be interested in it. The answer is (C).
+
+
+ Q: A woman who knows she has active herpes and untreated syphilis but continues
+ to have sex without informing her partners of her condition has, in psychoanalytic
+ terms:
+
+ (A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
+ for help. A person with a weak superego tends to be delinquent, criminal, or antisocial.
+ The actions of the woman, who knows she has an active venereal disease but
+ still has sex with her partners, indicate she may have an antisocial personality. The
+ answer is (B).
+
+
+ Q: The nature of homosexual activities that occur during preadolescence include
+ all but which of the following?
+
+ (A) sexual intercourse (B) circle jerks (C) exhibitionism (D) touching each other''s
+ genitals
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
+ for help. Sexual intercourse is prohibited during preadolescence. The answer is
+ (A).
+
+
+ Q: Women''s ability to have multiple orgasms is primarily due to:
+
+ (A) the fact that they do not have a refractory period. (B) the response of the
+ inner layers of the vagina. (C) having alternating orgasms in different locations.
+ (D) the G-Spot.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
+ for help. The refractory period is the time when a person is not able to be erect
+ or is not interested in sex. The answer is (A).
+
+
+ Q: Morning sickness is typically a problem:
+
+ (A) during the first trimester (B) during the second trimester (C) during the third
+ trimester (D) all through the pregnancy
+
+ A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
+ for help. 
Morning sickness usually begins by nine weeks after conception, corresponding + to the first trimester. The answer is (A).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_human_sexuality diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml new file mode 100644 index 00000000..655a39e8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml @@ -0,0 +1,80 @@ +dataset_name: international_law +description: 'The following are multiple choice questions (with answers) about international + law. + + + Q: How the consent to be bound of a State may be expressed? + + (A) The consent of a State to be bound is expressed only by ratification (B) The + consent of a state to be bound by a treaty may be expressed by signature, ratification, + acceptance, approval or accession (C) The consent of a State to be bound is expressed + by signature (D) The consent of a State to be bound is expressed by whatever means + they choose + + A: Let''s think step by step. We refer to Wikipedia articles on international law + for help. Article 11 of Vienna Convention on the Law of Treaties signed in 1969 + states that "the consent of a State to be bound by a treaty may be expressed by + signature, exchange of instruments constituting a treaty, ratification, acceptance, + approval or accession, or by any other means if so agreed." (B) is the most precise + and accurate answer. The answer is (B). + + + Q: What is the judge ad hoc? + + (A) If a party to a contentious case before the ICJ does not have a national sitting + as judge, it is entitled to nominate someone as a judge solely for that case, with + the title of judge ad hoc (B) Judge ad hoc is the member of the bench of the ICJ + with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge is disqualified + or passes away (D) Judge ad hoc is the judge that each party will always nominate + in every contentious case + + A: Let''s think step by step. We refer to Wikipedia articles on international law + for help. As "ad hoc" implies, a judge ad hoc is appointed only for a specific case + or period, when a party to a contentious case before the International Court of + Justice does not have a regular national sitting as judge. The answer is (A). + + + Q: When ''consent'' can serve as a circumstance precluding the wrongfulness of a + State conduct? + + (A) Consent can serve as a circumstance precluding the wrongfulness whenever it + is given (B) Consent can never serve as a circumstance precluding wrongfulness (C) + Consent can serve as a circumstance precluding wrongfulness, provided the consent + is valid and to the extent that the conduct remains within the limits of the consent + given (D) Consent can always serve as a circumstance precluding wrongfulness, no + matter which organ of the State gives it + + A: Let''s think step by step. We refer to Wikipedia articles on international law + for help. Valid consent can serve as a circumstance precluding the wrongfulness + of a State conduct if the conduct remains within the limits of that consent, according + to Chapter V of the Responsibility of States for Internationally Wrongful Acts, + 2001, United Nations. The answer is (C). + + + Q: Would a reservation to the definition of torture in the ICCPR be acceptable in + contemporary practice? 
+ + (A) This is an acceptable reservation if the reserving country''s legislation employs + a different definition (B) This is an unacceptable reservation because it contravenes + the object and purpose of the ICCPR (C) This is an unacceptable reservation because + the definition of torture in the ICCPR is consistent with customary international + law (D) This is an acceptable reservation because under general international law + States have the right to enter reservations to treaties + + A: Let''s think step by step. We refer to Wikipedia articles on international law + for help. For it contravenes the object and purpose of the ICCPR, this is an unacceptable + reservation in contemporary practice. The answer is (B). + + + Q: What types of force does Article 2(4) of the UN Charter prohibit? + + (A) Article 2(4) encompasses only armed force (B) Article 2(4) encompasses all types + of force, including sanctions (C) Article 2(4) encompasses all interference in the + domestic affairs of States (D) Article 2(4) encompasses force directed only against + a State''s territorial integrity + + A: Let''s think step by step. We refer to Wikipedia articles on international law + for help. Article 2(4) of the UN Charter prohibits states from using armed forces + in their international relations. The answer is (A).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_international_law diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml new file mode 100644 index 00000000..7e11f0f7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml @@ -0,0 +1,69 @@ +dataset_name: jurisprudence +description: 'The following are multiple choice questions (with answers) about jurisprudence. + + + Q: Iverson Jewelers wrote a letter to Miller, ''We have received an exceptionally + fine self winding Rolox watch which we will sell to you at a very favorable price.'' + + (A) The letter is an offer to sell (B) A valid offer cannot be made by letter. (C) + The letter contains a valid offer which will terminate within a reasonable time. + (D) The letter lacks one of the essential elements of an offer. + + A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for + help. An offer shows the intent to enter into a mutually-beneficial contract with + specific terms. An offer can be made by a letter. While this letter indicates the + willingness to sell, the lack of specific terms, such as transaction price and offer + expiration date, makes it an incomplete offer. The answer is (D). + + + Q: Functions of the law include all but which of the following? + + (A) maximizing individual freedom (B) providing a basis for compromise (C) keeping + the peace (D) promoting the principles of the free enterprise system + + A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for + help. Laws are fundamentally about helping resolve disputes between individuals, + and therefore essential for maximizing individual freedom, providing a basis for + compromise, and keeping the peace. The answer is (D). + + + Q: The ________ School of jurisprudence postulates that the law is based on what + is "correct." + + (A) Natural Law (B) Analytical (C) Historical (D) Sociological + + A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for + help. 
Natural Law School of jurisprudence focuses on the laws of nature, and states + that the law should be based on ethics, morals, and what is "correct". Analytical + deals with the law as it already exists, Historical postulates that the law was + found and not made, and Sociological studies how the law and society impact each + other. The answer is (A). + + + Q: Which word best summarizes Weber''s explanation of the development of formally + rational law? + + (A) Authority. (B) Charisma. (C) Co-operation. (D) Capitalism. + + A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for + help. Weber explained the development of formal rationality in laws as how the modern + society moved from tradition to rationality, where people decide actions based less + on how they were culturally done and more on expected utilities. How rational individuals + optimize efficiency of accomplishing tasks for higher rewards is a core principle + of Capitalism. The answer is (D). + + + Q: Which position does Rawls claim is the least likely to be adopted by the POP + (people in the original position)? + + (A) The POP would choose equality above liberty. (B) The POP would opt for the ''maximin'' + strategy. (C) The POP would opt for the ''difference principle''. (D) The POP would + reject the ''system of natural liberty.'' + + A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for + help. The POP would opt for the ''maximin'' strategy, opt for the ''difference principle'', + and reject the ''system of natural liberty'', but the POP would not choose equality + above liberty, since the POP assume both equal and free citizens. The answer is + (A).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_jurisprudence diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml new file mode 100644 index 00000000..f6f3c359 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml @@ -0,0 +1,71 @@ +dataset_name: logical_fallacies +description: 'The following are multiple choice questions (with answers) about logical + fallacies. + + + Q: When an arguer causes confusion during refutation because of real or feigned + lack of an ability to engage in refutation, that arguer may have committed the fallacy + of + + (A) poor sportsmanship (B) appeal to compassion (C) argument against the person + (D) ignorance of refutation + + A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies + for help. Ignorance of refutation, one of Aristotle''s original list of logical + fallacies in his Organon, is when someone causes confusion in an argument through + real or feigned inability to engage in refutation, in order to win the argument. + The answer is (D). + + + Q: The complex question fallacy consists of + + (A) arguing something is inferior just because it doesn''t do something it was never + intended to do. (B) including more than one claim in the proposition and treating + proof for one claim as proof for all the claims. (C) drawing a conclusion before + examining the evidence, and only considering evidence that supports that conclusion. + (D) asking a question that includes either an unproven assumption or more than one + question, thus making a straightforward yes or no answer meaningless. + + A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies + for help. 
The complex question fallacy is when someone makes a single yes or no
+ answer to a question meaningless, by including either an unproven assumption or
+ many questions. The latter is also known as the many questions fallacy. The answer
+ is (D).
+
+
+ Q: Arguing that what is true of the parts must be true of the whole is the fallacy
+ of...
+
+ (A) Division (B) Composition (C) Appeal to the person (D) Appeal to ignorance
+
+ A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies
+ for help. Fallacy of composition occurs when someone argues what is true of the
+ parts must be true of the whole. The answer is (B).
+
+
+ Q: Which of the following is true of a valid categorical syllogism?
+
+ (A) The minor premise must deny the antecedent (B) The major premise must affirm
+ the consequent (C) The middle term must be used in at least one premise in a universal
+ or unqualified sense (D) All of the above
+
+ A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies
+ for help. A valid categorical syllogism must satisfy several conditions: (1) the
+ syllogism must have exactly three terms, (2) every term of the syllogism must be
+ used exactly twice, (3) a term may be used only once in any premise, and (4) the
+ middle term must be used in at least one premise in a universal or unqualified sense,
+ etc. Only (C) is true. The answer is (C).
+
+
+ Q: If someone attacks the character of an opposing arguer, instead of responding
+ to that opponent''s arguments, the first person has probably committed which of
+ the following fallacies?
+
+ (A) tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi
+
+ A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies
+ for help. The argument against the person fallacy occurs when someone irrelevantly
+ attacks the character of an opposing arguer, instead of addressing that opponent''s
+ arguments. The answer is (C).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_logical_fallacies
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml
new file mode 100644
index 00000000..1856af53
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml
@@ -0,0 +1,59 @@
+dataset_name: machine_learning
+description: "The following are multiple choice questions (with answers) about machine\
+ \ learning.\n\nQ: Which image data augmentation is most common for natural images?\n\
+ (A) random crop and horizontal flip (B) random crop and vertical flip (C) posterization\
+ \ (D) dithering\nA: Let's think step by step. Data augmentation is used to increase\
+ \ the diversity of images in the training dataset. It is important that natural\
+ \ images are kept natural after being augmented. Vertical flips of images are not\
+ \ natural, so (B) is false. Posterization makes the image look like a poster, and\
+ \ dithering increases color depth. Neither of these preserves the natural property.\
+ \ The only natural data augmentation technique is (A). The answer is (A).\n\nQ:\
+ \ Traditionally, when we have a real-valued input attribute during decision-tree\
+ \ learning we consider a binary split according to whether the attribute is above\
+ \ or below some threshold. Pat suggests that instead we should just have a multiway\
+ \ split with one branch for each of the distinct values of the attribute. 
From the\
+ \ list below choose the single biggest problem with Pat\u2019s suggestion:\n(A)\
+ \ It is too computationally expensive. (B) It would probably result in a decision\
+ \ tree that scores badly on the training set and a testset. (C) It would probably\
+ \ result in a decision tree that scores well on the training set but badly on a\
+ \ testset. (D) It would probably result in a decision tree that scores well on a\
+ \ testset but badly on a training set.\nA: Let's think step by step. Because the\
+ \ input is real valued, it is unlikely that the same values appear both at training\
+ \ and test time. This means that while such a decision tree could yield good performance\
+ \ on the training data, when evaluated on the test data it will perform badly because\
+ \ the decision tree won\u2019t know what to do with numbers that did not appear\
+ \ in the training data. The answer is (C).\n\nQ: You are reviewing papers for the\
+ \ World\u2019s Fanciest Machine Learning Conference, and you see submissions with\
+ \ the following claims. Which ones would you consider accepting?\n(A) My method\
+ \ achieves a training error lower than all previous methods! (B) My method achieves\
+ \ a test error lower than all previous methods! (Footnote: When regularisation parameter\
+ \ \u03BB is chosen so as to minimise test error.) (C) My method achieves a test\
+ \ error lower than all previous methods! (Footnote: When regularisation parameter\
+ \ \u03BB is chosen so as to minimise cross-validation error.) (D) My method achieves\
+ \ a cross-validation error lower than all previous methods! (Footnote: When regularisation\
+ \ parameter \u03BB is chosen so as to minimise cross-validation error.)\nA: Let's\
+ \ think step by step. In machine learning, we train with some data and fixed hyperparameters\
+ \ and the training error can be arbitrarily low, so (A) can\u2019t be right. Then,\
+ \ one compares different hyperparameters by selecting the model with the lowest\
+ \ cross-validation error; this means that (B) and (D) are not the right procedure.\
+ \ The only relevant number after these is the test error and thus (C) is the right\
+ \ answer. The answer is (C).\n\nQ: A 6-sided die is rolled 15 times and the results\
+ \ are: side 1 comes up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times;\
+ \ side 5: 4 times; side 6: 5 times. Based on these results, what is the probability\
+ \ of side 3 coming up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16\
+ \ (D) 1.0/5\nA: Let's think step by step. Add-1 smoothing adds the value of one\
+ \ to the different counts and then normalizes the probabilities accordingly. The\
+ \ counts after adding one will be: side 1 comes up 1 time; side 2: 2 times; side\
+ \ 3: 3 times; side 4: 4 times; side 5: 5 times; side 6: 6 times. The total count\
+ \ after smoothing is 15 + 6 = 21, so the probability of rolling a three is 3/21 = 1/7.\
+ \ The answer is (B).\n\nQ: To achieve an 0/1 loss estimate that is less than 1 percent\
+ \ of the true 0/1 loss (with probability 95%), according to Hoeffding's inequality\
+ \ the IID test set must have how many examples?\n(A) around 10 examples (B) around\
+ \ 100 examples (C) between 100 and 500 examples (D) more than 1000 examples\nA:\
+ \ Let's think step by step. 
By Hoeffding\u2019s inequality, the in-sample and\
+ \ out-of-sample errors differ by more than epsilon with probability at most\
+ \ 2 exp(-2 epsilon^2 N) for N samples. Requiring 2 exp(-2 epsilon^2 N) < 0.05 with\
+ \ epsilon = 0.01 implies that N > (1/(2*epsilon^2)) log(2/0.05) = 5000 log(40).\
+ \ Since log(40) > 1, one needs more than\
+ \ 1000 examples. The answer is (D)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_machine_learning
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml
new file mode 100644
index 00000000..db2f9642
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml
@@ -0,0 +1,54 @@
+dataset_name: management
+description: 'The following are multiple choice questions (with answers) about management.
+
+
+ Q: How can organisational structures that are characterised by democratic and inclusive
+ styles of management be described?
+
+ (A) Hierarchical (B) Bureaucratic (C) Flat (D) Functional
+
+ A: Let''s think step by step. We refer to Wikipedia articles on management for help.
+ Flat organizational structures are characterized by democratic and inclusive styles
+ of management, and have few (if any) levels of management between the workers and
+ managers. The answer is (C).
+
+
+ Q: Hygiene factors are associated with which writer?
+
+ (A) Frederick Hertzberg (B) D.C. McClelland (C) Abraham Maslow (D) Douglas McGregor
+
+ A: Let''s think step by step. We refer to Wikipedia articles on management for help.
+ Hygiene factors include compensation, company policies, supervision, interpersonal
+ relations, and work environments. Hertzberg lists them as factors that cannot motivate
+ employees but can minimize job dissatisfaction. The answer is (A).
+
+
+ Q: What characteristic is not a key feature of the ''open systems'' model of management?
+
+ (A) Morale (B) Innovation (C) Growth resource (D) Adaptation
+
+ A: Let''s think step by step. We refer to Wikipedia articles on management for help.
+ The key characteristics of an open system in management include innovation, growth
+ resource, and adaptation, but do not include morale. The answer is (A).
+
+
+ Q: Which element of the cultural web forms regalia?
+
+ (A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems
+
+ A: Let''s think step by step. We refer to Wikipedia articles on management for help.
+ The cultural web is a tool for mapping an organization''s culture, where symbols
+ form the regalia that visually expresses the values that the organization holds
+ as important. The answer is (A).
+
+
+ Q: What are the two main dimensions of the Ohio Studies into leadership?
+
+ (A) Starting position and end position (B) Initial environment and changed environment
+ (C) Organisational structure and conditioning (D) Initiating structure and considerations
+
+ A: Let''s think step by step. We refer to Wikipedia articles on management for help.
+ The Ohio State Leadership Studies conducted in the 1940s identified initiating structure
+ and consideration as the two main dimensions of leader behavior. The answer is (D).'
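As a quick sanity check (an editorial aside, not part of the patched YAML itself), the arithmetic in the machine-learning few-shot answers above can be reproduced with a few lines of standard-library Python; the variable names below are illustrative:

    import math

    # Add-1 (Laplace) smoothing for the die example: observed counts for sides 1..6.
    counts = [0, 1, 2, 3, 4, 5]                 # 15 rolls in total
    smoothed = [c + 1 for c in counts]          # add one to every side's count
    p_three = smoothed[2] / sum(smoothed)       # (2 + 1) / (15 + 6)
    assert abs(p_three - 1 / 7) < 1e-12        # matches answer (B): 1/7

    # Hoeffding bound: 2 * exp(-2 * eps**2 * N) < 0.05  =>  N > log(2 / 0.05) / (2 * eps**2).
    eps = 0.01                                  # "less than 1 percent" accuracy
    n_min = math.log(2 / 0.05) / (2 * eps**2)   # equals 5000 * log(40)
    print(round(n_min))                         # ~18444, i.e. more than 1000 examples, answer (D)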
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_management
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml
new file mode 100644
index 00000000..5dd683da
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml
@@ -0,0 +1,66 @@
+dataset_name: marketing
+description: 'The following are multiple choice questions (with answers) about marketing.
+
+
+ Q: Although the content and quality can be as controlled as direct mail, response
+ rates of this medium are lower because of the lack of a personal address mechanism.
+ This media format is known as:
+
+ (A) Care lines. (B) Direct mail. (C) Inserts. (D) Door to door.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on marketing for help.
+ Door to door marketing delivers non-addressed items within all buildings within
+ a geographic area. While it can control the content and quality as well as direct
+ mail marketing, its response rate is lower because of the lack of a personal address
+ mechanism. The answer is (D).
+
+
+ Q: In an organization, the group of people tasked with buying decisions is referred
+ to as the _______________.
+
+ (A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making
+ unit.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on marketing for help.
+ In an organization, the group of people tasked with buying decisions is referred
+ to as the decision-making unit. The answer is (D).
+
+
+ Q: The single group within society that is most vulnerable to reference group influence
+ is:
+
+ (A) The older consumer who feels somewhat left out of things. (B) The married women,
+ many of whom feel a need for stability in their lives. (C) New immigrants who really
+ want to assimilate into their new culture. (D) Children, who base most of their
+ buying decisions on outside influences.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on marketing for help.
+ Children, who mostly base their buying decisions on outside influences, are the
+ single group within society that is most vulnerable to reference group influence.
+ The answer is (D).
+
+
+ Q: Which of the following is an assumption in Maslow''s hierarchy of needs?
+
+ (A) Needs are dependent on culture and also on social class. (B) Lower-level needs
+ must be at least partially satisfied before higher needs can affect behaviour. (C)
+ Needs are not prioritized or arranged in any particular order. (D) Satisfied needs
+ are motivators, and new needs emerge when current needs remain unmet.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on marketing for help.
+ Maslow''s hierarchy of needs, from the bottom upwards, are physiological (food and
+ clothing), safety, love and belonging needs, esteem, and self-actualization. Lower-level
+ needs must be at least partially satisfied before higher ones can affect behavior.
+ The answer is (B).
+
+
+ Q: _____________ is a natural outcome when combining demographic and geographic
+ variables.
+
+ (A) Geodemographics (B) Product differentiation. (C) ANSOFF matrix. (D) Brand management.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on marketing for help.
+ Geodemographics is a natural outcome when combining demographic and geographic variables.
+ The answer is (A).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_marketing
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml
new file mode 100644
index 00000000..ebf699aa
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml
@@ -0,0 +1,61 @@
+dataset_name: medical_genetics
+description: 'The following are multiple choice questions (with answers) about medical
+ genetics.
+
+
+ Q: The stage of meiosis in which chromosomes pair and cross over is:
+
+ (A) prophase I (B) metaphase I (C) prophase II (D) metaphase II
+
+ A: Let''s think step by step. We refer to Wikipedia articles on medical genetics
+ for help. Prophase I is the stage of meiosis where homologous chromosomes pair with
+ each other and exchange genetic material. The answer is (A).
+
+
+ Q: DNA ligase is
+
+ (A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial
+ origin which cuts DNA at defined base sequences (C) an enzyme that facilitates transcription
+ of specific genes (D) an enzyme which limits the level to which a particular nutrient
+ reaches
+
+ A: Let''s think step by step. We refer to Wikipedia articles on medical genetics
+ for help. DNA ligase is a type of enzyme (EC 6.5.1.1) responsible for joining DNA
+ strands together by catalyzing a phosphodiester bond. The answer is (A).
+
+
+ Q: Which of the following conditions does not show multifactorial inheritance?
+
+ (A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural tube defects) (D)
+ Marfan syndrome
+
+ A: Let''s think step by step. We refer to Wikipedia articles on medical genetics
+ for help. Multifactorial inheritance is when more than a single factor is responsible
+ for causing a given trait or health problem. Genes cannot be the only factor. Marfan
+ syndrome, on the other hand, requires only one abnormal copy of the Marfan
+ gene, from one parent, to inherit the trait. The answer is (D).
+
+
+ Q: A gene showing codominance
+
+ (A) has both alleles independently expressed in the heterozygote (B) has one allele
+ dominant to the other (C) has alleles tightly linked on the same chromosome (D)
+ has alleles expressed at the same time in development
+
+ A: Let''s think step by step. We refer to Wikipedia articles on medical genetics
+ for help. Codominance, as it relates to genetics, refers to a type of genetic inheritance
+ where the phenotype of both the parents is easily observed in the offspring. A heterozygote
+ is an individual having two different alleles of a gene. The answer is (A).
+
+
+ Q: Large triplet repeat expansions can be detected by:
+
+ (A) polymerase chain reaction. (B) single strand conformational polymorphism analysis.
+ (C) Southern blotting. (D) Western blotting.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on medical genetics
+ for help. A Southern blot is a method in molecular biology for detecting specific
+ DNA sequences in a sample. Large triplet repeat expansions are usually detected
+ with this method. The answer is (C).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_medical_genetics
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml
new file mode 100644
index 00000000..a506e940
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml
@@ -0,0 +1,54 @@
+dataset_name: miscellaneous
+description: 'The following are multiple choice questions (with answers) about miscellaneous.
+
+
+ Q: Which of these songs was a Top 10 hit for the rock band The Police?
+
+ (A) ''Radio Ga-Ga'' (B) ''Ob-la-di Ob-la-da'' (C) ''De Do Do Do De Da Da Da'' (D)
+ ''In-a-Gadda-Da-Vida''
+
+ A: Let''s think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by
+ Queen. Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly.
+ That leaves ''De Do Do Do De Da Da Da'' as the only song by The Police, and it was
+ also a Top 10 hit. The answer is (C).
+
+
+ Q: What place is named in the title of the 1979 live album by rock legends Cheap
+ Trick?
+
+ (A) Budapest (B) Budokan (C) Bhutan (D) Britain
+
+ A: Let''s think step by step. We refer to Wikipedia for help. Nippon Budokan is
+ an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including
+ Cheap Trick in 1978. ''Cheap Trick at Budokan'' became the name of their album.
+ The answer is (B).
+
+
+ Q: What is produced during photosynthesis?
+
+ (A) hydrogen (B) nylon (C) oxygen (D) light
+
+ A: Let''s think step by step. We refer to Wikipedia for help. Photosynthesis is
+ the process in which green plants use the green pigment chlorophyll to synthesize
+ foods with water and carbon dioxide. Oxygen is the byproduct of this process. The
+ answer is (C).
+
+
+ Q: Who is the shortest man to ever win an NBA slam dunk competition?
+
+ (A) Anthony ''Spud'' Webb (B) Michael ''Air'' Jordan (C) Tyrone ''Muggsy'' Bogues
+ (D) Julius ''Dr J'' Erving
+
+ A: Let''s think step by step. We refer to Wikipedia for help. In 1986, Spud Webb,
+ standing only 5''7", became the shortest NBA player in history to win an official
+ slam dunk contest. The answer is (A).
+
+
+ Q: How many axles does a standard automobile have?
+
+ (A) one (B) two (C) four (D) eight
+
+ A: Let''s think step by step. We refer to Wikipedia for help. Most cars have two
+ axles to rotate the wheels. The answer is (B).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_miscellaneous
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
new file mode 100644
index 00000000..fdde3633
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
@@ -0,0 +1,72 @@
+dataset_name: moral_disputes
+description: 'The following are multiple choice questions (with answers) about moral
+ disputes.
+
+
+ Q: Baron admits that the versions of the ticking bomb hypothetical she discusses
+ are "stunningly stupid," but she claims this is actually evidence of
+
+ (A) the stupidity of most traditional philosophical examples. (B) a general lack
+ of intelligence among people with advanced degrees. (C) the wrongness of torture.
+ (D) the readiness on the part of many intelligent people to see torture as the best
+ solution to deal with terrorism.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
+ help. 
The ticking bomb hypothetical poses a problem where many people will die from
+ an exploding bomb if the hypothetical terrorist does not disclose how to defuse
+ it. Baron sees this hypothetical as silly, but its prevalence does suggest intelligent
+ people, particularly utilitarians, see torture as justifiable to save the lives
+ in this scenario. The answer is (D).
+
+
+ Q: A fertilized ovum is also known as
+
+ (A) a zygote. (B) an embryo. (C) a viability. (D) a blastocyst.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
+ help. Once a single sperm penetrates the layers of an egg to form a new cell, that
+ cell is called a zygote. The answer is (A).
+
+
+ Q: Pence compares six different cases of reproduction, from natural twinning to
+ SCNT. What conclusion does he draw from this comparison?
+
+ (A) SCNT is not a different kind of reproduction because there are no morally relevant
+ differences between it and other permissible means of reproduction. (B) Because
+ there is a low risk of harm for natural twinning, there will be a low risk of harm
+ for SCNT. (C) both A and B (D) neither A nor B
+
+ A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
+ help. SCNT, known as ''somatic cell nuclear transfer'', is a technique in which
+ the nucleus of a somatic (body) cell is transferred to the cytoplasm of an enucleated
+ egg. This is used in farm animal cloning. Pence may compare this to other processes
+ of reproduction, and see little moral separation. The answer is (A).
+
+
+ Q: Which of the following is an intrinsic good, according to Singer?
+
+ (A) being in the upper class (B) the ability to learn and solve problems (C) being
+ more beautiful than most of the population (D) being shorter than average
+
+ A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
+ help. Bioethicist Peter Singer sees intrinsic value as innate values conferred by
+ oneself, for oneself. Inanimate objects can be beautiful, short, or have some valuable
+ criteria, but capabilities are intrinsically good. The answer is (B).
+
+
+ Q: According to Metz, what is wrong with consequentialist arguments against capital
+ punishment based on African values?
+
+ (A) It is unclear as of yet whether or not capital punishment deters harm to the
+ community. (B) It is unclear as of yet whether or not capital punishment deters
+ harm to any individuals. (C) Consequentialism is not supported by African values.
+ (D) Even though consequentialism is supported by African values, no consequentialist
+ arguments framed in terms of African values have been offered.
+
+ A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
+ help. Thaddeus Metz is a humanities research professor focusing on ethical philosophy
+ in South Africa. Metz has written that the death penalty is unjustified as it treats
+ individuals as incapable of communal relations. It is unclear whether capital punishment
+ is to the benefit of, or a deterrent of harm to, the community. The answer is (A).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_moral_disputes
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
new file mode 100644
index 00000000..4c82f5d5
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
@@ -0,0 +1,66 @@
+dataset_name: moral_scenarios
+description: 'The following are multiple choice questions (with answers) about moral
+  scenarios.
+
+
+  Q: For which of these two scenarios does the main character (who uses I/me/my) do
+  something clearly morally wrong, according to ordinary moral standards in the US
+  as of 2020? Scenario 1 | I took his wallet after he harassed me at the bar. Scenario
+  2 | I made love to my newborn baby boy.
+
+  (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
+
+  A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
+  for help. Stealing from someone is not considered an ethical response to harassment.
+  And having sex with a minor is egregiously unethical. The answer is (A).
+
+
+  Q: For which of these two scenarios does the main character (who uses I/me/my) do
+  something clearly morally wrong, according to ordinary moral standards in the US
+  as of 2020? Scenario 1 | I asked for a cigarette from the guy outside the store.
+  Scenario 2 | When the security guard left his post, I pocketed a rare coin.
+
+  (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
+
+  A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
+  for help. Asking for something is not morally wrong. Stealing is morally wrong.
+  The answer is (C).
+
+
+  Q: For which of these two scenarios does the main character (who uses I/me/my) do
+  something clearly morally wrong, according to ordinary moral standards in the US
+  as of 2020? Scenario 1 | During the house''s demolition, I pushed him into a wall.
+  Scenario 2 | When I choose a babysitter for my twin nieces, I don''t care about
+  her racial background.
+
+  (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
+
+  A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
+  for help. Physical violence is morally wrong. Being open to hiring people from different
+  racial backgrounds is an expected moral stance. The answer is (B).
+
+
+  Q: For which of these two scenarios does the main character (who uses I/me/my) do
+  something clearly morally wrong, according to ordinary moral standards in the US
+  as of 2020? Scenario 1 | I bought an old pick-up truck for work. Scenario 2 | I
+  looked up at the gorgeous waterfall and threw a rock at it.
+
+  (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
+
+  A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
+  for help. Neither of these are wrong as no one is wronged by buying a pick-up truck
+  or throwing a rock into a stream. The answer is (D).
+
+
+  Q: For which of these two scenarios does the main character (who uses I/me/my) do
+  something clearly morally wrong, according to ordinary moral standards in the US
+  as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new
+  girl is embarrassed to live in a trailer park, so I invite the class to her house.
+
+  (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
+
+  A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
+  for help. Loving someone is not wrong. However, exposing something that someone
+  is embarrassed about could be considered quite mean. The answer is (C).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_moral_scenarios
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml
new file mode 100644
index 00000000..eae79250
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml
@@ -0,0 +1,72 @@
+dataset_name: nutrition
+description: 'The following are multiple choice questions (with answers) about nutrition.
+
+
+  Q: What is the first-line drug for patients with type 2 diabetes and obesity, as
+  of 2020?
+
+  (A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin
+
+  A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help.
+  Metformin (Fortamet, Glumetza, or others) is usually the first medication prescribed
+  for type 2 diabetes, as well as obesity. It works by lowering glucose production
+  in the liver and improving the body''s sensitivity to insulin. The answer is (B).
+
+
+  Q: Which of the following statements is correct (according to knowledge in 2020)?
+
+  (A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame
+  (B) Consumers with phenylketonuria must avoid the consumption of the sweetener saccharin
+  (C) Consumers with phenylketonuria must avoid the consumption of the sweetener sucralose
+  (D) Consumers with phenylketonuria must avoid the consumption of the sweetener acesulfame
+  K
+
+  A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help.
+  People with phenylketonuria (PKU) cannot break down the amino acid phenylalanine.
+  As it builds up in the blood and brain it can lead to brain damage. People with
+  PKU should avoid foods that are converted to phenylalanine in the body, such as
+  aspartame. The answer is (A).
+
+
+  Q: Which of the following statements about iodine is correct, as of 2020?
+
+  (A) 50% of adults consume iodine at levels below the RNI (B) Dairy products are
+  a poor source of iodine (C) The iodine content of organic milk is generally lower
+  than the level in non-organic milk (D) UK dietary reference values recommend an
+  increase in iodine intake in pregnancy
+
+  A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help.
+  Organic milk usually has less iodine content than non-organic milk. The answer is
+  (C).
+
+
+  Q: Which of the following is the most plausible explanation for the protective effect
+  of dietary fibre against cancer of the colon, as of 2020?
+
+  (A) Propionic acid, formed during colonic fibre fermentation inhibits liver fatty
+  acid synthesis (B) Butyric acid, formed during colonic fibre fermentation stimulates
+  "silencing" of the SLC5A8 tumour suppressor gene (C) None of these options are correct
+  (D) Butyric acid, formed during colonic fibre fermentation stimulates anti-oxidant
+  defences in the colon
+
+  A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help.
+  Dietary fibre is inversely proportional to the risk of colorectal cancer. This is
+  presumed because butyric acid (BA) stimulates antioxidants which help protect the
+  colon from cancerous tumors. The answer is (D).
+
+
+  Q: In a cohort study, the risk ratio of developing diabetes was 0.86 when comparing
+  consumers of tea (the exposed) to those who did not drink tea (the unexposed). Which
+  one statement is correct (according to knowledge in 2020)?
+
+  (A) The tea drinkers have lower risk of developing diabetes. (B) The tea drinkers
+  have higher risk of developing diabetes. (C) Based on the information given we cannot
+  tell if the observed difference in disease risk is the result of chance. (D) The
+  risk ratio is close to the value one, so there is no difference in disease risk
+  between the two groups.
+
+  A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help.
+  The risk ratio is not sufficiently reduced that it could not be explained by random
+  chance given the study''s sample size. The answer is (C).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_nutrition
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml
new file mode 100644
index 00000000..60ce6c54
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml
@@ -0,0 +1,30 @@
+dataset_name: philosophy
+description: "The following are multiple choice questions (with answers) about philosophy.\n\
+  \nQ: The study of reality in the broadest sense, an inquiry into the elemental nature\
+  \ of the universe and the things in it, is known as _____.\n(A) metaphysics (B)\
+  \ epistemology (C) quantum physics (D) axiology\nA: Let's think step by step. We\
+  \ refer to Wikipedia articles on philosophy for help. Among the options, only metaphysics\
+  \ studies the nature of reality and existence. The answer is (A).\n\nQ: According\
+  \ to Moore\u2019s \u201Cideal utilitarianism,\u201D the right action is the one\
+  \ that brings about the greatest amount of:\n(A) pleasure. (B) happiness. (C) good.\
+  \ (D) virtue.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy\
+  \ for help. Moore's \"ideal utilitarianism\" states that one's actions should maximize\
+  \ intrinsic goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion,\
+  \ what was his perspective on the meaning of life?\n(A) optimist (B) satisfied (C)\
+  \ nominally religious (D) pessimist\nA: Let's think step by step. We refer to Wikipedia\
+  \ articles on philosophy for help. Before his conversion, Tolstoy felt that life\
+  \ was uncertain, which is a pessimist's point of view. The answer is (D).\n\nQ:\
+  \ According to d'Holbach, people always act according to _____.\n(A) free choices\
+  \ (B) dictates of the soul (C) necessary natural laws (D) undetermined will\nA:\
+  \ Let's think step by step. We refer to Wikipedia articles on philosophy for help.\
+  \ d'Holbach believes that people act according to necessary laws, and it proves\
+  \ nothing about people's free will. The answer is (C).\n\nQ: Psychological egoism\
+  \ is:\n(A) an ethical theory about how we ought to behave. (B) a generalization\
+  \ concerning the way people tend to behave. (C) a claim about human nature and the\
+  \ ways people are capable of behaving. (D) none of the above.\nA: Let's think step\
+  \ by step. We refer to Wikipedia articles on philosophy for help. Psychological\
+  \ egoism suggests that one behaves based on what makes one feel good, hence it\
+  \ is a claim about human nature and how humans are capable of behaving. The answer\
+  \ is (C)."
+include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_philosophy diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml new file mode 100644 index 00000000..e1c8dcc6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml @@ -0,0 +1,67 @@ +dataset_name: prehistory +description: 'The following are multiple choice questions (with answers) about prehistory. + + + Q: What is the approximate mean cranial capacity of Homo erectus? + + (A) under 650 cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc + + A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. + The average cranium capacity of Homo erectus is less than 1000 cubic cm. The answer + is (C). + + + Q: According to Timothy Pauketat, the evidence for social stratification and political + power at Cahokia suggests: + + (A) a center of Mississippian civilization with conditions similar to the rise of + early states. (B) the limitations of authority in a Native American society of egalitarian + foragers. (C) a simple chiefdom or perhaps a complex chiefdom had evolved by A.D. + 1500. (D) a center of Mississippian civilization with conditions similar to societies + on the Northwest Coast of North America. + + A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. + Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian + culture, where he found similar conditions to the rise of early states. The answer + is (A). + + + Q: Recent research on hominid species dating from the Middle Pliocene indicates + there was (as of 2020): + + (A) a great amount of species diversity, or a single species that exhibited a lot + of diversity. (B) very little species diversity during this period and very few + hominids. (C) decreased species diversity due to a prolonged ice age followed by + a severe drought. (D) decreased species diversity but increased numbers of hammerstones + and flakes, indicating stone tool manufacture. + + A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. + Recent research has recognized multiple hominid species from the Middle Pliocene, + meaning that there is a great amount of species diversity or diversity in a single + species. The answer is (A). + + + Q: Researchers now believe that the decline of the Maya was caused chiefly by: + + (A) a cataclysm of some kind, such as an earthquake, volcano, or tsunami. (B) ecological + degradation resulting from slash-and-burn farming techniques. (C) endless wars between + neighboring Mayan city-states. (D) practices of interbreeding that led to a steep + rise in congenital disorders. + + A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. + Researchers believe that the Maya collapse was mainly caused by over-exploitation + of natural resources like the slash-and-burn farming techniques. The answer is (B). + + + Q: The great Mayan king Pacal built temples in the city of Palenque in order to: + + (A) satisfy the powerful Mayan astronomer priests. (B) display his generosity to + the common people, since they were allowed to live in the temples. (C) frighten + away enemies, in particular the Spaniards. (D) legitimize his kingship, since his + father was not royal. + + A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. + Pacal built the temples as the funerary monument to legitimize his kingship. 
The + answer is (D).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_prehistory diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml new file mode 100644 index 00000000..c4957a1f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml @@ -0,0 +1,47 @@ +dataset_name: professional_accounting +description: "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer\u2019\ + s subledger. Which of the following management assertions is supported by this test?\n\ + (A) Valuation and allocation (B) Completeness (C) Rights and obligations (D) Presentation\ + \ and disclosure\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ accounting for help. The completeness assertion is tested by tracing supporting\ + \ documents to the record entries. The answer is (B).\n\nQ: One hundred years ago,\ + \ your great-great-grandmother invested $100 at 5% yearly interest. What is the\ + \ investment worth today?\n(A) $13,000 (B) $600 (C) $15,000 (D) $28,000\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on accounting for help. A $100\ + \ investment at 5% yearly interest is worth 100*(1.05)^100=13150 after 100 years,\ + \ which is around $13,000. The answer is (A).\n\nQ: On January 1, year 1, Alpha\ + \ Co. signed an annual maintenance agreement with a software provider for $15,000\ + \ and the maintenance period begins on March 1, year 1. Alpha also incurred $5,000\ + \ of costs on January 1, year 1, related to software modification requests that\ + \ will increase the functionality of the software. Alpha depreciates and amortizes\ + \ its computer and software assets over five years using the straight-line method.\ + \ What amount is the total expense that Alpha should recognize related to the maintenance\ + \ agreement and the software modifications for the year ended December 31, year\ + \ 1?\n(A) $5,000 (B) $13,500 (C) $16,000 (D) $20,000\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on accounting for help. The maintenance period\ + \ begins on March 1, so only 10 months of expenses should be recognized, which is\ + \ $15,000/12*10=$12,500. The software modification cost is amortized over 5 years,\ + \ so each year is $5,000/5=$1,000. So the total expense is $12,500+$1,000=$13,500.\ + \ The answer is (B).\n\nQ: Krete is an unmarried taxpayer with income exclusively\ + \ from wages. By December 31, year 1, Krete's employer has withheld $16,000 in federal\ + \ income taxes and Krete has made no estimated tax payments. On April 15, year 2,\ + \ Krete timely filed for an extension request to file her individual tax return,\ + \ and paid $300 of additional taxes. Krete's year 1 tax liability was $16,500 when\ + \ she timely filed her return on April 30, year 2, and paid the remaining tax liability\ + \ balance. What amount would be subject to the penalty for underpayment of estimated\ + \ taxes?\n(A) $0 (B) $500 (C) $1,650 (D) $16,500\nA: Let's think step by step. We\ + \ refer to Wikipedia articles on accounting for help. The tax due after withholding\ + \ is $16,500-$16,000=$500, which is less than $1000, hence there is no underpayment\ + \ penalty of estimated taxes. 
The answer is (A).\n\nQ: Box a nongovernmental not-for-profit\ + \ organization had the following transactions during the year: Proceeds from sale\ + \ of investments $80000 Purchase of property plant and equipment $10000 Proceeds\ + \ from long-term debt $100000 Loss on sale of investment $5000 What amount should\ + \ be reported as net cash provided by financing activities in Box's statement of\ + \ cash flows?\n(A) $70,000 (B) $75,000 (C) $80,000 (D) 100000\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on accounting for help. Among the four\ + \ transactions, only Proceeds from long-term debt belongs to the financing activities\ + \ section of cashflow, hence the amount reported should be $100000. The answer is\ + \ (D)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_professional_accounting diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml new file mode 100644 index 00000000..f3a957db --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml @@ -0,0 +1,105 @@ +dataset_name: professional_law +description: "The following are multiple choice questions (with answers) about professional\ + \ law.\n\nQ: A son owed a creditor $5,000. The son's father contacted the creditor\ + \ and told him that he wanted to pay the son's debt. The father signed a document\ + \ that stated the father would pay the son's debt at a rate of $500 a month for\ + \ 10 months. The creditor made no written or oral commitment to forbear to sue the\ + \ son to collect the $5,000 debt, and the father made no oral or written request\ + \ for any such forbearance. For the next five months, the father made and the creditor\ + \ accepted the $500 monthly payments as agreed. During that period, the creditor,\ + \ in fact, did forbear to take any legal action against the son. However, the father\ + \ then informed the creditor that he would make no further payments on the debt.\ + \ Which of the following is the most persuasive argument that the father is liable\ + \ to the creditor under the terms of their agreement?\n(A) The father's promise\ + \ and the creditor's reliance thereon, if proved, gave rise to a valid claim by\ + \ the creditor against the father based on the doctrine of promissory estoppel.\ + \ (B) Because it was foreseeable that the father's promise would induce the creditor\ + \ to forbear taking any action against the son, such forbearance was, as a matter\ + \ of law, a bargained-for consideration for the father's promise. (C) The father's\ + \ five payments to the creditor totaling $2,500 manifested a serious intent on the\ + \ father's part to be contractually bound, and such manifestation is generally recognized\ + \ as an effective substitute for consideration. (D) By assuming the antecedent debt\ + \ obligation that the son owed to the creditor, the father became a surety whose\ + \ promise to the creditor was enforceable, since it was in writing and supported\ + \ by adequate consideration. \nA: Let's think step by step. We refer to Wikipedia\ + \ articles on law for help. The doctrine of promissory estoppel stops a person from\ + \ going back on a promise in contract law, hence option (A) should be the most persuasive\ + \ argument. The answer is (A).\n\nQ: A state has recently enacted a statute prohibiting\ + \ the disposal of any nuclear wastes within the state. This law does not contravene\ + \ or conflict with any federal statutes. 
A man operates a company in the state that\ + \ is engaged in the disposal of nuclear wastes. Subsequent to the passage of the\ + \ state statute, the man, not yet aware of the new law, entered into contracts with\ + \ many out-of-state firms to dispose of their nuclear wastes in the state. On account\ + \ of this new law, however, the man will be unable to perform these contracts. Assume\ + \ that the man has standing to challenge this state law. Which of the following\ + \ presents his strongest constitutional grounds to challenge the state law prohibiting\ + \ the disposal of nuclear wastes within the state?\n(A) The commerce clause. (B)\ + \ The equal protection clause of the Fourteenth Amendment. (C) The privileges and\ + \ immunities clause of Article IV, Section 2. (D) The contract clause.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on law for help. The commerce\ + \ clause states that Congress shall have the power to regulate commerce with foreign\ + \ Nations, and among the several States, and with the Indian Tribes. The statute\ + \ affects inter-state commerce which puts it into question. Hence the man's strongest\ + \ argument should be the commerce clause. The answer is (A).\n\nQ: On October 1,\ + \ 1980, a developer, owner of several hundred acres in a rural county, drafted a\ + \ general development plan for the area. The duly recorded plan imposed elaborate\ + \ limitations and restrictions upon the land in the plan, which was to be developed\ + \ as a residential district. The restrictions were to extend to all persons acquiring\ + \ any of the lots and to their heirs, assigns, and lessees. It was further provided\ + \ that all subsequent owners would be charged with due notice of the restrictions.\ + \ Among those restrictions in the general plan were the following:(22) A franchise\ + \ right is created in a strip of land 10 feet in width along the rear of each lot\ + \ for the use of public utility companies with right of ingress and egress. (23)\ + \ No house or structure of any kind shall be built on the aforementioned strip of\ + \ land running through the said blocks. In 2000, a retiree purchased one of the\ + \ lots, built a house, and erected a fence in the rear of his property within the\ + \ restricted area. In 2004, a teacher purchased a lot adjacent to the retiree's\ + \ property and built a new house. Two years later, a librarian purchased the lot\ + \ that adjoined the teacher's property. The three deeds to those properties each\ + \ contained references to the deed book where the general plan was recorded. In\ + \ 2008, the librarian began the construction of a seven-foot post-and-rail fence\ + \ along the line dividing his lot with the teacher's, and along the center of the\ + \ area subject to the franchise right. Although the teacher objected to its construction,\ + \ the fence was completed. If the teacher seeks a mandatory injunction to compel\ + \ removal of the librarian's fence, the court will most likely\n(A) grant relief,\ + \ because the fence was in violation of the easement restriction. (B) grant relief,\ + \ because the encroachment of the fence violated the restriction in the original\ + \ plan. (C) deny relief, because the teacher failed to enforce the restriction against\ + \ the retiree. (D) deny relief, because the fence would not be construed as \"a\ + \ structure\" within the terms of the restriction. \nA: Let's think step by step.\ + \ We refer to Wikipedia articles on law for help. 
The restrictions in the original\
+  \ plan say no house or structure of any kind shall be built on the aforementioned\
+  \ strip of land running through the said blocks. Hence the court will most likely\
+  \ grant relief because the fence violated the restriction in the original plan.\
+  \ The answer is (B).\n\nQ: Judge took judicial notice of some facts at the beginning\
+  \ of the trial. Which of the following is not an appropriate kind of fact for judicial\
+  \ notice?\n(A) Indisputable facts. (B) Facts that have been asserted by individual\
+  \ political organizations. (C) Facts recognized to be true by common knowledge.\
+  \ (D) Facts capable of scientific verification.\nA: Let's think step by step. We\
+  \ refer to Wikipedia articles on law for help. Among the options, facts that have\
+  \ been asserted by individual political organizations are not an appropriate kind\
+  \ of fact for judicial notice. The answer is (B).\n\nQ: A state legislature has\
+  \ recently enacted a statute making it a misdemeanor to curse or revile or use obscene\
+  \ or opprobrious language toward or in reference to a police officer performing\
+  \ his duties. A student at a state university organized a demonstration on campus\
+  \ to protest the war. The rally was attended by a group of 50 students who shouted\
+  \ anti-war messages at cars passing by. To show his contempt for the United States,\
+  \ the student sewed the American flag to the rear of his jeans. When a police officer\
+  \ saw the flag sewn on the student's jeans, he approached and told him to remove\
+  \ the flag or he would be placed under arrest. The student became angered and shouted\
+  \ at the police officer, \"Listen, you bastard, I'll wear this rag anywhere I please.\
+  \ \" The student was subsequently placed under arrest and charged with violating\
+  \ the state statute. The student subsequently brings suit in state court challenging\
+  \ the constitutionality of the statute. The strongest constitutional argument for\
+  \ the student is that\n(A) the statute is void for vagueness under the Fourteenth\
+  \ Amendment's due process clause. (B) the statute is invalid because it violates\
+  \ the petitioner's freedom of speech under the First Amendment. (C) the statute\
+  \ is an abridgment of freedom of speech under the First Amendment because less restrictive\
+  \ means are available for achieving the same purpose. (D) the statute is overbroad\
+  \ and consequently invalid under the First and Fourteenth Amendments.\nA: Let's\
+  \ think step by step. We refer to Wikipedia articles on law for help. The Fourteenth\
+  \ Amendment further supports the First Amendment by establishing a due process clause.\
+  \ Hence the strongest argument should be that the statute is overbroad and consequently\
+  \ invalid under the First and Fourteenth Amendments. The answer is (D)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_professional_law
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml
new file mode 100644
index 00000000..89b890f8
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml
@@ -0,0 +1,69 @@
+dataset_name: professional_medicine
+description: "The following are multiple choice questions (with answers) about professional\
+  \ medicine.\n\nQ: A 22-year-old male marathon runner presents to the office with\
+  \ the complaint of right-sided rib pain when he runs long distances. 
Physical examination\ + \ reveals normal heart and lung findings and an exhalation dysfunction at ribs\_\ + 4-5 on the right. Which of the following muscles or muscle groups will be most useful\ + \ in correcting this dysfunction utilizing a direct method?\n(A) anterior scalene\ + \ (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on medicine for help. Among the options,\ + \ only pectoralis minor muscle origins from the outer surfaces of the 3rd to 5th\ + \ ribs. The answer is (C).\n\nQ: A 36-year-old male presents to the office with\ + \ a\_3-week\_history of low back pain. He denies any recent trauma but says that\ + \ he climbs in and out of his truck numerous times a day for his job. Examination\ + \ of the patient in the prone position reveals a deep sacral sulcus on the left,\ + \ a posterior inferior lateral angle on the right, and a lumbosacral junction that\ + \ springs freely on compression. The most likely diagnosis is\n(A) left-on-left\ + \ sacral torsion (B) left-on-right sacral torsion (C) right unilateral sacral flexion\ + \ (D) right-on-right sacral torsion\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on medicine for help. The deep sulcus on the left, a posterior ILA on\ + \ the right, with a negative spring test suggests a right-on-right sacral torsion.\ + \ All other options have a deep sulcus on the right. The answer is (D).\n\nQ: A\ + \ 44-year-old man comes to the office because of a 3-day history of sore throat,\ + \ nonproductive cough, runny nose, and frontal headache. He says the headache is\ + \ worse in the morning and ibuprofen does provide some relief. He has not had shortness\ + \ of breath. Medical history is unremarkable. He takes no medications other than\ + \ the ibuprofen for pain. Vital signs are temperature 37.4\xB0C (99.4\xB0F), pulse\ + \ 88/min, respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the\ + \ nares shows erythematous mucous membranes. Examination of the throat shows erythema\ + \ and follicular lymphoid hyperplasia on the posterior oropharynx. There is no palpable\ + \ cervical adenopathy. Lungs are clear to auscultation. Which of the following is\ + \ the most likely cause of this patient's symptoms?\n(A) Allergic rhinitis (B) Epstein-Barr\ + \ virus (C) Mycoplasma pneumonia (D) Rhinovirus\nA: Let's think step by step. We\ + \ refer to Wikipedia articles on medicine for help. The symptoms, especially the\ + \ headache, suggest that the most likely cause is Rhinovirus. Epstein-Barr virus\ + \ will cause swollen lymph nodes but there is no palpable cervical adenopathy. Lungs\ + \ are clear to auscultation suggests it's not Mycoplasma pneumonia. The answer is\ + \ (D).\n\nQ: A previously healthy 32-year-old woman comes to the physician 8 months\ + \ after her husband was killed in a car crash. Since that time, she has had a decreased\ + \ appetite and difficulty falling asleep. She states that she is often sad and cries\ + \ frequently. She has been rechecking the door lock five times before leaving her\ + \ house and has to count exactly five pieces of toilet paper before she uses it.\ + \ She says that she has always been a perfectionist but these urges and rituals\ + \ are new. Pharmacotherapy should be targeted to which of the following neurotransmitters?\n\ + (A) Dopamine (B) Glutamate (C) Norepinephrine (D) Serotonin\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on medicine for help. 
The patient feels\
+  \ sad and among the options, only Dopamine and Serotonin can help increase positive\
+  \ emotions. Serotonin also affects digestion and metabolism, which can help the\
+  \ patient's decreased appetite and sleep difficulty. The answer is (D).\n\nQ: A\
+  \ 42-year-old man comes to the office for preoperative evaluation prior to undergoing\
+  \ adrenalectomy scheduled in 2 weeks. One month ago, he received care in the emergency\
+  \ department for pain over his right flank following a motor vehicle collision.\
+  \ At that time, blood pressure was 160/100 mm Hg and CT scan of the abdomen showed\
+  \ an incidental 10-cm left adrenal mass. Results of laboratory studies, including\
+  \ complete blood count, serum electrolyte concentrations, and liver function tests,\
+  \ were within the reference ranges. The patient otherwise had been healthy and had\
+  \ never been told that he had elevated blood pressure. He takes no medications.\
+  \ A follow-up visit in the office 2 weeks ago disclosed elevated urinary normetanephrine\
+  \ and metanephrine and plasma aldosterone concentrations. The patient was referred\
+  \ to a surgeon, who recommended the adrenalectomy. Today, vital signs are temperature\
+  \ 36.6\xB0C (97.9\xB0F), pulse 100/min, respirations 14/min, and blood pressure\
+  \ 170/95 mm Hg. Physical examination discloses no significant findings. Initial\
+  \ preoperative preparation should include treatment with which of the following?\n\
+  (A) Labetalol (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\n\
+  A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\
+  \ The symptoms and the adrenal mass suggested pheochromocytoma, and the blood pressure\
+  \ indicates hypertension. Phenoxybenzamine is used to treat hypertension caused\
+  \ by pheochromocytoma. The answer is (D)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_professional_medicine
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml
new file mode 100644
index 00000000..e1e5206d
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml
@@ -0,0 +1,47 @@
+dataset_name: professional_psychology
+description: "The following are multiple choice questions (with answers) about professional\
+  \ psychology.\n\nQ: In the construction of a multiple regression equation for purposes\
+  \ of prediction, the optimal combination of measures is one in which the predictors\n\
+  (A) are uncorrelated with each other but are moderately correlated with the criterion\
+  \ (B) have low correlations with each other and low correlations with the criterion\
+  \ (C) are highly intercorrelated with each other and moderately correlated with\
+  \ the criterion (D) have low correlations with the criterion but are moderately\
+  \ correlated with each other\nA: Let's think step by step. We refer to Wikipedia\
+  \ articles on psychology for help. The basis of multiple regression is to assess\
+  \ the relationship between one continuous variable and a set of independent variables.\
+  \ So the predictors should be uncorrelated with each other but are moderately correlated\
+  \ with the criterion. The answer is (A).\n\nQ: There are three ways to measure the\
+  \ Central Tendency: the Mean, the Median and the Mode. From your knowledge about\
+  \ them, what is the mode?\n(A) less sensitive to extreme scores than the mean (B)\
+  \ more useful for skewed distributions (C) sensitive to extreme values and highly\
+  \ skewed distributions (D) the most frequently occurring number\nA: Let's think\
+  \ step by step. We refer to Wikipedia articles on psychology for help. The definition\
+  \ of mode is the most frequently occurring number. The answer is (D).\n\nQ: Carl\
+  \ Jung believed that a client's transference:\n(A) is a fantasy that distracts the\
+  \ client from reality. (B) represents \u201Cmixed feelings\u201D toward the therapist.\
+  \ (C) is a form of \"acting out.\" (D) reflects the client\u2019s personal\
+  \ and collective unconscious.\nA: Let's think step by step. We refer to Wikipedia\
+  \ articles on psychology for help. Transference is a phenomenon in which a person's\
+  \ feelings are unconsciously redirected, so it reflects the client's personal and\
+  \ collective unconscious. The answer is (D).\n\nQ: In terms of Hofstede\u2019s (1980)\
+  \ five cultural dimensions, the United States scores at the top of the scale on:\n\
+  (A) individualism. (B) individualism and power distance. (C) power distance and\
+  \ masculinity. (D) uncertainty avoidance.\nA: Let's think step by step. We refer\
+  \ to Wikipedia articles on psychology for help. US scores highest on individualism\
+  \ among the five cultural dimensions. The answer is (A).\n\nQ: One of your therapy\
+  \ clients asks your advice about a good weight-reduction program. You have investigated\
+  \ the programs in the community and are enrolled in the one you consider the best.\
+  \ This program offers a $50 bonus to its patrons for each new person they bring\
+  \ into the program. Under these circumstances, your most appropriate response would\
+  \ be to\n(A) tell your client the pros and cons of each program you know about except\
+  \ for the one in which you are enrolled (B) recommend to your client the program\
+  \ in which you are enrolled and explain the $50 bonus you will receive (C) recommend\
+  \ to your client the program in which you are enrolled and offer to have the $50\
+  \ bonus credited to your client's account in the program (D) tell your client the\
+  \ pros and cons of each program you know about, but do not claim the $50 bonus if\
+  \ your client enrolls in your program\nA: Let's think step by step. We refer to\
+  \ Wikipedia articles on psychology for help. Based on the circumstances, you should\
+  \ tell your client about the pros and cons of each program, but it would be inappropriate\
+  \ to receive the bonus, so you should not claim the $50 bonus. The answer is (D)."
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_professional_psychology
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml
new file mode 100644
index 00000000..be4edf98
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml
@@ -0,0 +1,65 @@
+dataset_name: public_relations
+description: 'The following are multiple choice questions (with answers) about public
+  relations.
+
+
+  Q: Earth Hour was a campaign launched by which organization?
+
+  (A) Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund
+
+  A: Let''s think step by step. We refer to Wikipedia articles on public relations
+  for help. Earth Hour is a worldwide movement launched by the World Wildlife
+  Fund. The answer is (D).
+ + + Q: In issues management, what is the most proactive approach to addressing negative + or misleading information posted online about your organization? + + (A) Buy domain names that could be used by opposition groups. (B) Post anonymous + comments on blogs to combat this information. (C) Prepare a news release that discredits + the inaccurate information. (D) Make policy changes to address complaints highlighted + on these sites. + + A: Let''s think step by step. We refer to Wikipedia articles on public relations + for help. In issues management, the most proactive approach to addressing negative + or misleading information posted online is to make policy changes to address complaints + highlighted on those sites. The answer is (D). + + + Q: At which stage in the planning process would a situation analysis be carried + out? + + (A) Defining the program (B) Planning the program (C) Taking action and implementing + ideas (D) Evaluation of the program + + A: Let''s think step by step. We refer to Wikipedia articles on public relations + for help. Situation analyses are typically carried out during the planning process + stage of defining the program. The answer is (A). + + + Q: Which of these statements is true of the Vatican in 2010 at the time of the accusations + of child abuse cover-ups? + + (A) There was a coordinated media response. (B) Consistent messages were communicated. + (C) Criticisms were taken as attacks on the Catholic Church. (D) The credibility + of the Vatican was upheld. + + A: Let''s think step by step. We refer to Wikipedia articles on public relations + for help. In 2010 when there were accusations of child abuse cover-ups, the Vatican + took those criticisms as attacks on the Catholic Church. The answer is (C). + + + Q: What should a public relations media practitioner do if she does not know the + answer to a reporter''s question? + + (A) Give the reporter other information she is certain is correct. (B) Say that + the information is ''off the record'' and will be disseminated later. (C) Say ''I + don''t know'' and promise to provide the information later. (D) Say ''no comment,'' + rather than appear uninformed. + + A: Let''s think step by step. We refer to Wikipedia articles on public relations + for help. If a public relations media practitioner does not know the answer to a + reporter''s question, they should say ''I don''t know'' and offer to provide the + information later. The answer is (C).' +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_public_relations diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml new file mode 100644 index 00000000..b08c321a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml @@ -0,0 +1,85 @@ +dataset_name: security_studies +description: "The following are multiple choice questions (with answers) about security\ + \ studies.\n\nQ: What are the frameworks of analysis within which terrorism has\ + \ been considered (as of 2020)?\n(A) Competition between larger nations has resulted\ + \ in some countries actively supporting terrorist groups to undermine the strength\ + \ of rival states. Terrorist networks are extended patronage clubs maintained and\ + \ paid for by their donor states and are conceptualised as being like state actors,\ + \ to be dealt with using military force. 
(B) Globalization has enabled the internationalization\
+  \ of terrorist activities by opening up their operational space, although coordination\
+  \ is still managed from a geographical base. This suggests that terrorist groups\
+  \ are nationally structured which means that terrorism cannot be considered in terms\
+  \ of a war to be defeated militarily without having serious implications on the\
+  \ indigenous population. (C) Terrorism can be viewed as a problem to be resolved\
+  \ by military means (war on terrorism), by normal police techniques (terrorism as\
+  \ crime), or as a medical problem with underlying causes and symptoms (terrorism\
+  \ as disease). (D) Terrorism is viewed as a criminal problem. The criminalization\
+  \ of terrorism has two important implications. Firstly, it suggests that terrorism\
+  \ can be eradicated - terrorists can be caught and brought to trial by normal judicial\
+  \ proceedings thereby removing the threat from society - and secondly, it suggests\
+  \ that preventative crime techniques are applicable to prevent its development.\n\
+  A: Let's think step by step. We refer to Wikipedia articles on security studies\
+  \ for help. (A) is wrong because it is not competition between larger nations that\
+  \ causes terrorism. \n(B) is wrong because globalization is not the cause of terrorism.\n\
+  (C) is correct because the US undertook the war on terrorism. \n(D) is wrong because\
+  \ preventative crime techniques will likely not end terrorism. The answer is (C).\n\
+  \nQ: Which of the following is the best lens through which to investigate the role\
+  \ of child soldiers?\n(A) Child soldiers are victims of combat that need re-education\
+  \ and rehabilitation. (B) Children and their mothers are not active subjects in\
+  \ warfare and are best considered as subjects in the private sphere. (C) Children\
+  \ are most often innocent bystanders in war and are best used as signifiers of peace.\
+  \ (D) Children have political subjecthood that is missed when they are considered\
+  \ as passive victims of warfare.\nA: Let's think step by step. We refer to Wikipedia\
+  \ articles on security studies for help. Child soldiers as a political topic can\
+  \ be missed when they are considered passive victims of warfare. The answer is (D).\n\
+  \nQ: How can we best describe the relationship between the state-centric approach\
+  \ and the concept of human security?\n(A) There are such wide divisions within the\
+  \ human security framework regarding the nature of threats and referent objects\
+  \ that no widely applicable comparisons between state-centric approaches and human\
+  \ security can be drawn. (B) By adopting the framework of human security, the limitations\
+  \ of the realist state-centric approach become evident. Whilst human security defines\
+  \ the referent object as the person or population, state-centric approaches prioritise\
+  \ the security of the state, de-prioritizing the pursuit of human security. (C)\
+  \ The state-centric approach to security is a faction of human security, usually\
+  \ defined within the broad school of human security. By being state-centric this\
+  \ approach prioritises the individual as the referent object in security studies.\
+  \ (D) Both the state-centric and human-centric approaches to security are mutually\
+  \ exclusive and offer a sufficient analytic framework with which to understand the\
+  \ international security system. 
It is therefore the role of security analysts to\ + \ determine which of these substantial concepts is correct, and which should be\ + \ discarded.\nA: Let's think step by step. We refer to Wikipedia articles on security\ + \ studies for help. Human security focuses on a person or population whereas state-centric\ + \ approaches focus on the state while deprioritizing human security. The answer\ + \ is (B).\n\nQ: In order to become securitized, a threat must be presented in which\ + \ of these ways?\n(A) As an existential threat that requires immediate and extraordinary\ + \ action, posing a threat to the survival of the state or to societal security.\ + \ (B) As requiring immediate and extraordinary action by the state, threatening\ + \ the survival of a referent object and therefore warranting the use of measures\ + \ not normally employed in the political realm. (C) As an urgent threat to the survival\ + \ of the referent object, so serious that it legitimises the employment of extraordinary\ + \ action in response. (D) As an urgent threat to the survival of the audience that\ + \ requires extraordinary or emergency measures.\nA: Let's think step by step. We\ + \ refer to Wikipedia articles on security studies for help. To be securitized, a\ + \ threat must be an urgent threat to the survival of the referent object. The answer\ + \ is (C).\n\nQ: What distinguishes coercive diplomacy from military force?\n(A)\ + \ Compellence is another term for coercive diplomacy, but covering a narrower set\ + \ of criteria; compellence covers those threats aimed at initiating adversary action.\ + \ A threat to coerce a state to give up part of its territory would count as coercive\ + \ diplomacy, as long as that threat proactively initiates action before reactive\ + \ diplomacy is taken. (B) Coercive diplomacy constitutes the threats of limited\ + \ force to induce adversary's incentive to comply with the coercer's demands. It\ + \ is an influence strategy that is intended to obtain compliance: the use of force\ + \ to defeat an opponent first does not count. It leaves an element of choice with\ + \ the target to comply, or to continue. (C) Military force, or the threat of military\ + \ force, utilises fear to achieve strategic objectives. Coercive diplomacy is differentiated\ + \ from this approach, because it does not use fear as a tool for coercing an adversary.\ + \ (D) Coercive diplomacy is employed to use force but to limit its effects on the\ + \ international community. Coercive diplomacy is an aggressive strategy that is\ + \ intended to obtain compliance through defeat. It does not leave an element of\ + \ choice with the target, the target either being forced to comply or engage in\ + \ conflict. It seeks to control by imposing compliance by removing any opportunity\ + \ for negotiation or concession.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on security studies for help. Coercive diplomacy uses the threat of force\ + \ to induce the opponent to comply with demands. The answer is (B)." +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_security_studies diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml new file mode 100644 index 00000000..38974b00 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml @@ -0,0 +1,67 @@ +dataset_name: sociology +description: 'The following are multiple choice questions (with answers) about sociology. 
+
+
+  Q: Which of the following is not a problem associated with official statistics on
+  strike action?
+
+  (A) most strikes go unnoticed by employers and the mass media (B) not all industrial
+  disputes will be reported by the employer (C) the definition of strikes excludes
+  those that involve fewer than ten workers or last less than one day (D) it is hard
+  to compare strikes that were measured in different ways
+
+  A: Let''s think step by step. We refer to Wikipedia articles on sociology for help.
+  Official statistics on strike action can be problematic because not all industrial
+  disputes will be reported by employers, the definition of strikes excludes those
+  that involve fewer than ten workers or last less than one day, and it is hard to
+  compare strikes that were measured in different ways. Thus, (A) is not a problem
+  associated with official statistics on strike action. The answer is (A).
+
+
+  Q: What does Berger (1963) describe as a metaphor for social reality?
+
+  (A) a fairground ride (B) a circus (C) a puppet theatre (D) a ballet
+
+  A: Let''s think step by step. We refer to Wikipedia articles on sociology for help.
+  Berger describes social reality using the metaphor of a puppet theatre. The answer
+  is (C).
+
+
+  Q: The term ''hegemony'' refers to:
+
+  (A) the tendency for the working class not to realize their own interests (B) a
+  dominant ideology that legitimates economic, political and cultural power (C) a
+  form of dual consciousness based on ideology and everyday experiences (D) a mode
+  of payment given for outstanding topiary
+
+  A: Let''s think step by step. We refer to Wikipedia articles on sociology for help.
+  Hegemony refers to a dominant ideology that legitimates economic, political, and
+  cultural power. The answer is (B).
+
+
+  Q: The shift from ''civil religion'' to ''common religion'' means that:
+
+  (A) the increasing bureaucracy of the state has made religion only a marginal part
+  of our lives (B) despite the weakening of traditional authority, our everyday lives
+  and ''common sense'' remain shaped by religious beliefs and values (C) religious
+  participation in collective worship may have declined, but people still practise
+  their faiths in private (D) people are much more likely to discuss their religious
+  beliefs in public, informal settings
+
+  A: Let''s think step by step. We refer to Wikipedia articles on sociology for help.
+  The shift from civil religion to common religion means that despite the weakening
+  of traditional authority, our everyday lives and common sense remain shaped by religious
+  beliefs and values. The answer is (B).
+
+
+  Q: Which of the following did the post-war welfare state of 1948 not aim to provide:
+
+  (A) free health care and education for all (B) a minimum wage (C) full employment
+  (D) universal welfare
+
+  A: Let''s think step by step. We refer to Wikipedia articles on sociology for help.
+  The post-war welfare state of 1948 aimed to provide free healthcare and education,
+  full employment, and universal welfare. But it did not aim to provide a minimum
+  wage. The answer is (B).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_sociology
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml
new file mode 100644
index 00000000..6340aee3
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml
@@ -0,0 +1,66 @@
+dataset_name: us_foreign_policy
+description: 'The following are multiple choice questions (with answers) about us
+  foreign policy.
+
+
+  Q: How did Donald Trump attack globalization in the 2016 campaign?
+
+  (A) Globalization had made men like him too rich (B) Globalization only benefited
+  certain American states, such as New York (C) Liberal elites had encouraged globalization,
+  while ''ordinary Americans'' lost jobs because of it (D) Globalization encouraged
+  damaging trade wars
+
+  A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy
+  for help. Trump attacked globalization because he believed ordinary Americans lost
+  jobs due to it, and so he wanted to blame liberals who had encouraged it. The answer
+  is (C).
+
+
+  Q: How did NSC-68 change U.S. strategy?
+
+  (A) It globalized containment. (B) It militarized containment. (C) It called for
+  the development of the hydrogen bomb. (D) All of the above
+
+  A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy
+  for help. NSC-68 outlined a variety of courses of action, including globalization
+  of containment, militarization of containment, and the development of the hydrogen
+  bomb. The answer is (D).
+
+
+  Q: How do Defensive Realism and Offensive Realism differ in their explanation of
+  state behaviour?
+
+  (A) Defensive realists place greater emphasis on the role of international institutions
+  (B) Defensive realists place less emphasis on geographical factors (C) Offensive
+  realists give more priority to the national interest than Defensive realists. (D)
+  Defensive realists believe states are security maximizers, while Offensive realists
+  believe states to be power maximizers
+
+  A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy
+  for help. While defensive realism advocates that states are security maximizers,
+  offensive realists think of states as power maximizers. The answer is (D).
+
+
+  Q: The realm of policy decisions concerned primarily with relations between the
+  United States and the rest of the world is known as
+
+  (A) terrorism policy. (B) economic policy. (C) foreign policy. (D) international
+  policy.
+
+  A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy
+  for help. The realm of policy decisions concerned with relations between the US
+  and the rest of the world is known as foreign policy. The answer is (C).
+
+
+  Q: How did the 2008 financial crisis affect America''s international reputation?
+
+  (A) It damaged support for the US model of political economy and capitalism (B)
+  It created anger at the United States for exaggerating the crisis (C) It increased
+  support for American global leadership under President Obama (D) It reduced global
+  use of the US dollar
+
+  A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy
+  for help. The 2008 financial crisis damaged the international reputation of the
+  American model of political economy and capitalism. The answer is (A).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_us_foreign_policy
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml
new file mode 100644
index 00000000..5bbd7a2c
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml
@@ -0,0 +1,55 @@
+dataset_name: virology
+description: 'The following are multiple choice questions (with answers) about virology.
+
+
+  Q: The median survival time to AIDS and death was established by following:
+
+  (A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters
+  (D) High-risk seronegatives
+
+  A: Let''s think step by step. We refer to Wikipedia articles on virology for help.
+  The median survival time to AIDS and death was established by following seroconverters.
+  The answer is (C).
+
+
+  Q: Which of the following is a morphological characteristic of the paramyxoviruses.
+
+  (A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate
+  viruses (C) Icosahedral viruses with envelope (D) Very large viruses
+
+  A: Let''s think step by step. We refer to Wikipedia articles on virology for help.
+  Paramyxoviruses are fragile viruses often visualised with RNA spewing from the inside.
+  The answer is (A).
+
+
+  Q: The most important goal of a behavioral intervention is:
+
+  (A) Change in behavior (B) Comprehensive coverage (C) Effective use of behavioral
+  theory (D) Sustained behavior change
+
+  A: Let''s think step by step. We refer to Wikipedia articles on virology for help.
+  The primary goal of a behavioral intervention is to cause sustained behavior change.
+  The answer is (D).
+
+
+  Q: A key factor facilitating the application of nested case-control studies from
+  the MACS was:
+
+  (A) Data collection (B) Establishment of a repository of biologic specimens (C)
+  Participant interest (D) Administration of the questionnaire by staff
+
+  A: Let''s think step by step. We refer to Wikipedia articles on virology for help.
+  The Multicenter AIDS Cohort Study''s use of nested case-control studies was facilitated
+  by the establishment of a repository of biologic specimens. The answer is (B).
+
+
+  Q: Why are parvoviruses a highly impactful parasite?
+
+  (A) Because they have no nucleic acid (B) They require a helper virus (C) Only replicate
+  in dividing cells (D) Can integrate into host chromosomes
+
+  A: Let''s think step by step. We refer to Wikipedia articles on virology for help.
+  Parvoviruses are highly impactful because they do not have nucleic acid. The answer
+  is (A).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_virology
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml
new file mode 100644
index 00000000..c01adcdb
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml
@@ -0,0 +1,53 @@
+dataset_name: world_religions
+description: 'The following are multiple choice questions (with answers) about world
+  religions.
+
+
+  Q: How can the Upanishads be characterized?
+
+  (A) Ritual texts (B) Philosophical texts (C) Hymns (D) Origin stories
+
+  A: Let''s think step by step. We refer to Wikipedia articles on world religions
+  for help. The Upanishads are the most recent part of Vedas (the oldest scriptures
+  in Hinduism) and supplied the basis of later Hindu philosophy. So they are philosophical
+  texts. The answer is (B).
+
+
+  Q: What is the Second Gem in Buddhism?
+
+  (A) The Dharma (B) The Sangha (C) The Buddha (D) The Bodhisattva
+
+  A: Let''s think step by step. We refer to Wikipedia articles on world religions
+  for help. The Second Gem in Buddhism is The Dharma. The answer is (A).
+
+
+  Q: Which Japanese government promoted a kind of national cult based on the emperor
+  and his associations with kami?
+
+  (A) Honen (B) Tanaka (C) Tokugawa (D) Meiji
+
+  A: Let''s think step by step. We refer to Wikipedia articles on world religions
+  for help. The promotion of a national cult based on the emperor and his associations
+  with Kami happened during the reign of Emperor Meiji (1852-1912). The answer is
+  (D).
+
+
+  Q: In which dynasty was the "Mandate of Heaven" developed to legitimatize the new
+  rulers?
+
+  (A) Shang (B) Zhou (C) Han (D) Xia
+
+  A: Let''s think step by step. We refer to Wikipedia articles on world religions
+  for help. The "Mandate of Heaven" was developed as an ancient Chinese philosophical
+  concept during the Zhou Dynasty (1046-256 BCE). The answer is (B).
+
+
+  Q: What is the sign of the covenant for Jewish males?
+
+  (A) The rainbow (B) Circumcision (C) A son (D) Bar mitzvah
+
+  A: Let''s think step by step. We refer to Wikipedia articles on world religions
+  for help. In Judaism, the most distinctive sign of the covenant is circumcision
+  (brit milah). The answer is (B).'
+include: _mmlu_flan_cot_fewshot_template_yaml
+task: mmlu_flan_cot_fewshot_world_religions
diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml
new file mode 100644
index 00000000..e5b8e429
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml
@@ -0,0 +1,25 @@
+group: mmlu_flan_cot_zeroshot
+dataset_path: cais/mmlu
+validation_split: validation
+fewshot_split: dev
+doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
+output_type: greedy_until
+fewshot_delimiter: ""
+doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
+filter_list:
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "(?<=The answer is )(.*)(.)"
+      - function: "take_first"
\ No newline at end of file
diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml
new file mode 100644
index 00000000..17bccf1f
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml
@@ -0,0 +1,8 @@
+dataset_name: abstract_algebra
+description: 'The following are multiple choice questions (with answers) about abstract
+  algebra.
+
+
+  '
+include: _mmlu_flan_generative_template_yaml
+task: mmlu_flan_cot_zeroshot_abstract_algebra
diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml
new file mode 100644
index 00000000..6e14fbc6
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml
@@ -0,0 +1,7 @@
+dataset_name: anatomy
+description: 'The following are multiple choice questions (with answers) about anatomy.
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_anatomy diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml new file mode 100644 index 00000000..b1ca9f52 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml @@ -0,0 +1,7 @@ +dataset_name: astronomy +description: 'The following are multiple choice questions (with answers) about astronomy. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_astronomy diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml new file mode 100644 index 00000000..53f3a78f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml @@ -0,0 +1,8 @@ +dataset_name: business_ethics +description: 'The following are multiple choice questions (with answers) about business + ethics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_business_ethics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml new file mode 100644 index 00000000..f858d671 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml @@ -0,0 +1,8 @@ +dataset_name: clinical_knowledge +description: 'The following are multiple choice questions (with answers) about clinical + knowledge. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_clinical_knowledge diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml new file mode 100644 index 00000000..93471b6a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml @@ -0,0 +1,8 @@ +dataset_name: college_biology +description: 'The following are multiple choice questions (with answers) about college + biology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_college_biology diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml new file mode 100644 index 00000000..5f619baa --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml @@ -0,0 +1,8 @@ +dataset_name: college_chemistry +description: 'The following are multiple choice questions (with answers) about college + chemistry. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_college_chemistry diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml new file mode 100644 index 00000000..865b91bf --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml @@ -0,0 +1,8 @@ +dataset_name: college_computer_science +description: 'The following are multiple choice questions (with answers) about college + computer science. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_college_computer_science diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml new file mode 100644 index 00000000..1f8a89fa --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: college_mathematics +description: 'The following are multiple choice questions (with answers) about college + mathematics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_college_mathematics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml new file mode 100644 index 00000000..e852c64b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml @@ -0,0 +1,8 @@ +dataset_name: college_medicine +description: 'The following are multiple choice questions (with answers) about college + medicine. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_college_medicine diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml new file mode 100644 index 00000000..f215c2f0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: college_physics +description: 'The following are multiple choice questions (with answers) about college + physics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_college_physics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml new file mode 100644 index 00000000..402f7bdc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml @@ -0,0 +1,8 @@ +dataset_name: computer_security +description: 'The following are multiple choice questions (with answers) about computer + security. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_computer_security diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml new file mode 100644 index 00000000..c3ad6376 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: conceptual_physics +description: 'The following are multiple choice questions (with answers) about conceptual + physics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_conceptual_physics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml new file mode 100644 index 00000000..dad5a83b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml @@ -0,0 +1,7 @@ +dataset_name: econometrics +description: 'The following are multiple choice questions (with answers) about econometrics. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_econometrics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml new file mode 100644 index 00000000..72a08dca --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml @@ -0,0 +1,8 @@ +dataset_name: electrical_engineering +description: 'The following are multiple choice questions (with answers) about electrical + engineering. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_electrical_engineering diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml new file mode 100644 index 00000000..0531f23e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: elementary_mathematics +description: 'The following are multiple choice questions (with answers) about elementary + mathematics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_elementary_mathematics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml new file mode 100644 index 00000000..80b26401 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml @@ -0,0 +1,8 @@ +dataset_name: formal_logic +description: 'The following are multiple choice questions (with answers) about formal + logic. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_formal_logic diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml new file mode 100644 index 00000000..491d0db4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml @@ -0,0 +1,8 @@ +dataset_name: global_facts +description: 'The following are multiple choice questions (with answers) about global + facts. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_global_facts diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml new file mode 100644 index 00000000..32da2e26 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_biology +description: 'The following are multiple choice questions (with answers) about high + school biology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_biology diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml new file mode 100644 index 00000000..5968e54e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_chemistry +description: 'The following are multiple choice questions (with answers) about high + school chemistry. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_chemistry diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml new file mode 100644 index 00000000..2666de90 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_computer_science +description: 'The following are multiple choice questions (with answers) about high + school computer science. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_computer_science diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml new file mode 100644 index 00000000..fb59ada4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_european_history +description: 'The following are multiple choice questions (with answers) about high + school european history. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_european_history diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml new file mode 100644 index 00000000..ed3fca55 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_geography +description: 'The following are multiple choice questions (with answers) about high + school geography. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_geography diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 00000000..62803b4b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_government_and_politics +description: 'The following are multiple choice questions (with answers) about high + school government and politics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_government_and_politics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 00000000..f973b58d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_macroeconomics +description: 'The following are multiple choice questions (with answers) about high + school macroeconomics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_macroeconomics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml new file mode 100644 index 00000000..550dfcf1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_mathematics +description: 'The following are multiple choice questions (with answers) about high + school mathematics. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_mathematics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml new file mode 100644 index 00000000..8a1e4c4c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_microeconomics +description: 'The following are multiple choice questions (with answers) about high + school microeconomics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_microeconomics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml new file mode 100644 index 00000000..4997e712 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_physics +description: 'The following are multiple choice questions (with answers) about high + school physics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_physics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml new file mode 100644 index 00000000..a3e801ca --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_psychology +description: 'The following are multiple choice questions (with answers) about high + school psychology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_psychology diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml new file mode 100644 index 00000000..d057cbef --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_statistics +description: 'The following are multiple choice questions (with answers) about high + school statistics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_statistics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml new file mode 100644 index 00000000..583d9591 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_us_history +description: 'The following are multiple choice questions (with answers) about high + school us history. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_us_history diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml new file mode 100644 index 00000000..40445582 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_world_history +description: 'The following are multiple choice questions (with answers) about high + school world history. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_high_school_world_history diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml new file mode 100644 index 00000000..c6db4c1c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml @@ -0,0 +1,8 @@ +dataset_name: human_aging +description: 'The following are multiple choice questions (with answers) about human + aging. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_human_aging diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml new file mode 100644 index 00000000..41795660 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml @@ -0,0 +1,8 @@ +dataset_name: human_sexuality +description: 'The following are multiple choice questions (with answers) about human + sexuality. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_human_sexuality diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml new file mode 100644 index 00000000..da1273b0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml @@ -0,0 +1,8 @@ +dataset_name: international_law +description: 'The following are multiple choice questions (with answers) about international + law. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_international_law diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml new file mode 100644 index 00000000..e1a6a28b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml @@ -0,0 +1,7 @@ +dataset_name: jurisprudence +description: 'The following are multiple choice questions (with answers) about jurisprudence. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_jurisprudence diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml new file mode 100644 index 00000000..e94cde17 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml @@ -0,0 +1,8 @@ +dataset_name: logical_fallacies +description: 'The following are multiple choice questions (with answers) about logical + fallacies. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_logical_fallacies diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml new file mode 100644 index 00000000..a17387bd --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml @@ -0,0 +1,8 @@ +dataset_name: machine_learning +description: 'The following are multiple choice questions (with answers) about machine + learning. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_machine_learning diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml new file mode 100644 index 00000000..68fc6ba2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml @@ -0,0 +1,7 @@ +dataset_name: management +description: 'The following are multiple choice questions (with answers) about management. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_management diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml new file mode 100644 index 00000000..f6c6444c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml @@ -0,0 +1,7 @@ +dataset_name: marketing +description: 'The following are multiple choice questions (with answers) about marketing. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_marketing diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml new file mode 100644 index 00000000..2490826b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml @@ -0,0 +1,8 @@ +dataset_name: medical_genetics +description: 'The following are multiple choice questions (with answers) about medical + genetics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_medical_genetics diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml new file mode 100644 index 00000000..5aebaef8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml @@ -0,0 +1,7 @@ +dataset_name: miscellaneous +description: 'The following are multiple choice questions (with answers) about miscellaneous. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_miscellaneous diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml new file mode 100644 index 00000000..85829454 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml @@ -0,0 +1,8 @@ +dataset_name: moral_disputes +description: 'The following are multiple choice questions (with answers) about moral + disputes. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_moral_disputes diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml new file mode 100644 index 00000000..f8a31ddc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml @@ -0,0 +1,8 @@ +dataset_name: moral_scenarios +description: 'The following are multiple choice questions (with answers) about moral + scenarios. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_moral_scenarios diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml new file mode 100644 index 00000000..238c3f1c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml @@ -0,0 +1,7 @@ +dataset_name: nutrition +description: 'The following are multiple choice questions (with answers) about nutrition. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_nutrition diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml new file mode 100644 index 00000000..c4a8fb47 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml @@ -0,0 +1,7 @@ +dataset_name: philosophy +description: 'The following are multiple choice questions (with answers) about philosophy. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_philosophy diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml new file mode 100644 index 00000000..07f31813 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml @@ -0,0 +1,7 @@ +dataset_name: prehistory +description: 'The following are multiple choice questions (with answers) about prehistory. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_prehistory diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml new file mode 100644 index 00000000..82b5ff2c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_accounting +description: 'The following are multiple choice questions (with answers) about professional + accounting. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_professional_accounting diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml new file mode 100644 index 00000000..32210b49 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_law +description: 'The following are multiple choice questions (with answers) about professional + law. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_professional_law diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml new file mode 100644 index 00000000..ed9eebe1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_medicine +description: 'The following are multiple choice questions (with answers) about professional + medicine. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_professional_medicine diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml new file mode 100644 index 00000000..7110b840 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_psychology +description: 'The following are multiple choice questions (with answers) about professional + psychology. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_professional_psychology diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml new file mode 100644 index 00000000..5138cdd8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml @@ -0,0 +1,8 @@ +dataset_name: public_relations +description: 'The following are multiple choice questions (with answers) about public + relations. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_public_relations diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml new file mode 100644 index 00000000..84c359d7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml @@ -0,0 +1,8 @@ +dataset_name: security_studies +description: 'The following are multiple choice questions (with answers) about security + studies. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_security_studies diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml new file mode 100644 index 00000000..fed1dc49 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml @@ -0,0 +1,7 @@ +dataset_name: sociology +description: 'The following are multiple choice questions (with answers) about sociology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_sociology diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml new file mode 100644 index 00000000..d94f60e9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml @@ -0,0 +1,8 @@ +dataset_name: us_foreign_policy +description: 'The following are multiple choice questions (with answers) about us + foreign policy. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_us_foreign_policy diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml new file mode 100644 index 00000000..feaa8b06 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml @@ -0,0 +1,7 @@ +dataset_name: virology +description: 'The following are multiple choice questions (with answers) about virology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_cot_zeroshot_virology diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml new file mode 100644 index 00000000..fe2b4c42 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml @@ -0,0 +1,8 @@ +dataset_name: world_religions +description: 'The following are multiple choice questions (with answers) about world + religions. 
+
+
+  '
+include: _mmlu_flan_generative_template_yaml
+task: mmlu_flan_cot_zeroshot_world_religions
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
new file mode 100644
index 00000000..b369024c
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
@@ -0,0 +1,18 @@
+group: mmlu_flan
+dataset_path: cais/mmlu
+validation_split: validation
+fewshot_split: dev
+doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:"
+output_type: greedy_until
+doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
\ No newline at end of file
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml
new file mode 100644
index 00000000..eb38e0fe
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml
@@ -0,0 +1,12 @@
+group: mmlu_flan_loglikelihood
+dataset_path: cais/mmlu
+validation_split: validation
+fewshot_split: dev
+doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:"
+output_type: multiple_choice
+doc_to_choice: ['(A)', '(B)', '(C)', '(D)']
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml
new file mode 100644
index 00000000..9fca2117
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml
@@ -0,0 +1,8 @@
+dataset_name: abstract_algebra
+description: 'The following are multiple choice questions (with answers) about abstract
+  algebra.
+
+
+  '
+include: _mmlu_flan_generative_template_yaml
+task: mmlu_flan_n_shot_abstract_algebra
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml
new file mode 100644
index 00000000..e8978402
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml
@@ -0,0 +1,7 @@
+dataset_name: anatomy
+description: 'The following are multiple choice questions (with answers) about anatomy.
+
+
+  '
+include: _mmlu_flan_generative_template_yaml
+task: mmlu_flan_n_shot_anatomy
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml
new file mode 100644
index 00000000..66902758
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml
@@ -0,0 +1,7 @@
+dataset_name: astronomy
+description: 'The following are multiple choice questions (with answers) about astronomy.
+
+
+  '
+include: _mmlu_flan_generative_template_yaml
+task: mmlu_flan_n_shot_astronomy
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml
new file mode 100644
index 00000000..f75a48fd
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml
@@ -0,0 +1,8 @@
+dataset_name: business_ethics
+description: 'The following are multiple choice questions (with answers) about business
+  ethics.
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_business_ethics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml new file mode 100644 index 00000000..07a3fe79 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml @@ -0,0 +1,8 @@ +dataset_name: clinical_knowledge +description: 'The following are multiple choice questions (with answers) about clinical + knowledge. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_clinical_knowledge diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml new file mode 100644 index 00000000..7465f0d3 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml @@ -0,0 +1,8 @@ +dataset_name: college_biology +description: 'The following are multiple choice questions (with answers) about college + biology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_college_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml new file mode 100644 index 00000000..17e7fbde --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml @@ -0,0 +1,8 @@ +dataset_name: college_chemistry +description: 'The following are multiple choice questions (with answers) about college + chemistry. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_college_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml new file mode 100644 index 00000000..d0032874 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml @@ -0,0 +1,8 @@ +dataset_name: college_computer_science +description: 'The following are multiple choice questions (with answers) about college + computer science. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_college_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml new file mode 100644 index 00000000..be1e01b2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: college_mathematics +description: 'The following are multiple choice questions (with answers) about college + mathematics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_college_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml new file mode 100644 index 00000000..4c8aa79e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml @@ -0,0 +1,8 @@ +dataset_name: college_medicine +description: 'The following are multiple choice questions (with answers) about college + medicine. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_college_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml new file mode 100644 index 00000000..cd07980b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: college_physics +description: 'The following are multiple choice questions (with answers) about college + physics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_college_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml new file mode 100644 index 00000000..93dc9040 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml @@ -0,0 +1,8 @@ +dataset_name: computer_security +description: 'The following are multiple choice questions (with answers) about computer + security. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_computer_security diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml new file mode 100644 index 00000000..2f313298 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: conceptual_physics +description: 'The following are multiple choice questions (with answers) about conceptual + physics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_conceptual_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml new file mode 100644 index 00000000..b46c90cb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml @@ -0,0 +1,7 @@ +dataset_name: econometrics +description: 'The following are multiple choice questions (with answers) about econometrics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_econometrics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml new file mode 100644 index 00000000..5d1ccca1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml @@ -0,0 +1,8 @@ +dataset_name: electrical_engineering +description: 'The following are multiple choice questions (with answers) about electrical + engineering. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_electrical_engineering diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml new file mode 100644 index 00000000..7260b752 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: elementary_mathematics +description: 'The following are multiple choice questions (with answers) about elementary + mathematics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_elementary_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml new file mode 100644 index 00000000..7dd42af5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml @@ -0,0 +1,8 @@ +dataset_name: formal_logic +description: 'The following are multiple choice questions (with answers) about formal + logic. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_formal_logic diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml new file mode 100644 index 00000000..373f99fb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml @@ -0,0 +1,8 @@ +dataset_name: global_facts +description: 'The following are multiple choice questions (with answers) about global + facts. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_global_facts diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml new file mode 100644 index 00000000..334286dc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_biology +description: 'The following are multiple choice questions (with answers) about high + school biology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml new file mode 100644 index 00000000..f4d3bcfb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_chemistry +description: 'The following are multiple choice questions (with answers) about high + school chemistry. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml new file mode 100644 index 00000000..03e0411b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_computer_science +description: 'The following are multiple choice questions (with answers) about high + school computer science. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml new file mode 100644 index 00000000..a7a73a24 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_european_history +description: 'The following are multiple choice questions (with answers) about high + school european history. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_european_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml new file mode 100644 index 00000000..d7b29960 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_geography +description: 'The following are multiple choice questions (with answers) about high + school geography. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_geography diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 00000000..dbb195e2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_government_and_politics +description: 'The following are multiple choice questions (with answers) about high + school government and politics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_government_and_politics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 00000000..a8e6c4cd --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_macroeconomics +description: 'The following are multiple choice questions (with answers) about high + school macroeconomics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_macroeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml new file mode 100644 index 00000000..35adc8b7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_mathematics +description: 'The following are multiple choice questions (with answers) about high + school mathematics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml new file mode 100644 index 00000000..a0887261 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_microeconomics +description: 'The following are multiple choice questions (with answers) about high + school microeconomics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_microeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml new file mode 100644 index 00000000..63a77a7d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_physics +description: 'The following are multiple choice questions (with answers) about high + school physics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml new file mode 100644 index 00000000..7d738494 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_psychology +description: 'The following are multiple choice questions (with answers) about high + school psychology. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml new file mode 100644 index 00000000..6b3e4b5a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_statistics +description: 'The following are multiple choice questions (with answers) about high + school statistics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_statistics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml new file mode 100644 index 00000000..e80b64e1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_us_history +description: 'The following are multiple choice questions (with answers) about high + school us history. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_us_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml new file mode 100644 index 00000000..83d55ae6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_world_history +description: 'The following are multiple choice questions (with answers) about high + school world history. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_high_school_world_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml new file mode 100644 index 00000000..2ff416b8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml @@ -0,0 +1,8 @@ +dataset_name: human_aging +description: 'The following are multiple choice questions (with answers) about human + aging. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_human_aging diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml new file mode 100644 index 00000000..2d7316ac --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml @@ -0,0 +1,8 @@ +dataset_name: human_sexuality +description: 'The following are multiple choice questions (with answers) about human + sexuality. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_human_sexuality diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml new file mode 100644 index 00000000..547cb3c0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml @@ -0,0 +1,8 @@ +dataset_name: international_law +description: 'The following are multiple choice questions (with answers) about international + law. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_international_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml new file mode 100644 index 00000000..51613f16 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml @@ -0,0 +1,7 @@ +dataset_name: jurisprudence +description: 'The following are multiple choice questions (with answers) about jurisprudence. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_jurisprudence diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml new file mode 100644 index 00000000..1b8a4d2b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml @@ -0,0 +1,8 @@ +dataset_name: logical_fallacies +description: 'The following are multiple choice questions (with answers) about logical + fallacies. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_logical_fallacies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml new file mode 100644 index 00000000..60973953 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml @@ -0,0 +1,8 @@ +dataset_name: abstract_algebra +description: 'The following are multiple choice questions (with answers) about abstract + algebra. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_abstract_algebra diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml new file mode 100644 index 00000000..ff927e05 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml @@ -0,0 +1,7 @@ +dataset_name: anatomy +description: 'The following are multiple choice questions (with answers) about anatomy. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_anatomy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml new file mode 100644 index 00000000..95329c44 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml @@ -0,0 +1,7 @@ +dataset_name: astronomy +description: 'The following are multiple choice questions (with answers) about astronomy. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_astronomy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml new file mode 100644 index 00000000..3f2bcc77 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml @@ -0,0 +1,8 @@ +dataset_name: business_ethics +description: 'The following are multiple choice questions (with answers) about business + ethics. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_business_ethics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml new file mode 100644 index 00000000..780c8bf6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml @@ -0,0 +1,8 @@ +dataset_name: clinical_knowledge +description: 'The following are multiple choice questions (with answers) about clinical + knowledge. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_clinical_knowledge diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml new file mode 100644 index 00000000..7d270f47 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml @@ -0,0 +1,8 @@ +dataset_name: college_biology +description: 'The following are multiple choice questions (with answers) about college + biology. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_college_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml new file mode 100644 index 00000000..e947d1a2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml @@ -0,0 +1,8 @@ +dataset_name: college_chemistry +description: 'The following are multiple choice questions (with answers) about college + chemistry. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_college_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml new file mode 100644 index 00000000..a23b0bd3 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml @@ -0,0 +1,8 @@ +dataset_name: college_computer_science +description: 'The following are multiple choice questions (with answers) about college + computer science. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_college_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml new file mode 100644 index 00000000..c03033eb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: college_mathematics +description: 'The following are multiple choice questions (with answers) about college + mathematics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_college_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml new file mode 100644 index 00000000..64f952bb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml @@ -0,0 +1,8 @@ +dataset_name: college_medicine +description: 'The following are multiple choice questions (with answers) about college + medicine. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_college_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml new file mode 100644 index 00000000..f339c316 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: college_physics +description: 'The following are multiple choice questions (with answers) about college + physics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_college_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml new file mode 100644 index 00000000..cc28f843 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml @@ -0,0 +1,8 @@ +dataset_name: computer_security +description: 'The following are multiple choice questions (with answers) about computer + security. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_computer_security diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml new file mode 100644 index 00000000..dc3c9096 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: conceptual_physics +description: 'The following are multiple choice questions (with answers) about conceptual + physics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_conceptual_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml new file mode 100644 index 00000000..034c0e63 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml @@ -0,0 +1,7 @@ +dataset_name: econometrics +description: 'The following are multiple choice questions (with answers) about econometrics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_econometrics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml new file mode 100644 index 00000000..20823b42 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml @@ -0,0 +1,8 @@ +dataset_name: electrical_engineering +description: 'The following are multiple choice questions (with answers) about electrical + engineering. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_electrical_engineering diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml new file mode 100644 index 00000000..afed59aa --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: elementary_mathematics +description: 'The following are multiple choice questions (with answers) about elementary + mathematics. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_elementary_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml new file mode 100644 index 00000000..2a2359f5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml @@ -0,0 +1,8 @@ +dataset_name: formal_logic +description: 'The following are multiple choice questions (with answers) about formal + logic. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_formal_logic diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml new file mode 100644 index 00000000..4d23b227 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml @@ -0,0 +1,8 @@ +dataset_name: global_facts +description: 'The following are multiple choice questions (with answers) about global + facts. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_global_facts diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml new file mode 100644 index 00000000..a9bdefee --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_biology +description: 'The following are multiple choice questions (with answers) about high + school biology. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml new file mode 100644 index 00000000..ec512f42 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_chemistry +description: 'The following are multiple choice questions (with answers) about high + school chemistry. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml new file mode 100644 index 00000000..67d70ec6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_computer_science +description: 'The following are multiple choice questions (with answers) about high + school computer science. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml new file mode 100644 index 00000000..62c6013b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_european_history +description: 'The following are multiple choice questions (with answers) about high + school european history. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_european_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml new file mode 100644 index 00000000..a4b6d856 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_geography +description: 'The following are multiple choice questions (with answers) about high + school geography. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_geography diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml new file mode 100644 index 00000000..f7c2cb8c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_government_and_politics +description: 'The following are multiple choice questions (with answers) about high + school government and politics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml new file mode 100644 index 00000000..b623360e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_macroeconomics +description: 'The following are multiple choice questions (with answers) about high + school macroeconomics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml new file mode 100644 index 00000000..b2ddfc0e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_mathematics +description: 'The following are multiple choice questions (with answers) about high + school mathematics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml new file mode 100644 index 00000000..e8bff48d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_microeconomics +description: 'The following are multiple choice questions (with answers) about high + school microeconomics. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_microeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml new file mode 100644 index 00000000..2b97ac1d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_physics +description: 'The following are multiple choice questions (with answers) about high + school physics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml new file mode 100644 index 00000000..fecb5f70 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_psychology +description: 'The following are multiple choice questions (with answers) about high + school psychology. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml new file mode 100644 index 00000000..4ff766db --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_statistics +description: 'The following are multiple choice questions (with answers) about high + school statistics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_statistics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml new file mode 100644 index 00000000..f725e916 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_us_history +description: 'The following are multiple choice questions (with answers) about high + school us history. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_us_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml new file mode 100644 index 00000000..0142ce33 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml @@ -0,0 +1,8 @@ +dataset_name: high_school_world_history +description: 'The following are multiple choice questions (with answers) about high + school world history. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_high_school_world_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml new file mode 100644 index 00000000..4b007ca1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml @@ -0,0 +1,8 @@ +dataset_name: human_aging +description: 'The following are multiple choice questions (with answers) about human + aging. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_human_aging diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml new file mode 100644 index 00000000..37d5e42a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml @@ -0,0 +1,8 @@ +dataset_name: human_sexuality +description: 'The following are multiple choice questions (with answers) about human + sexuality. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_human_sexuality diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml new file mode 100644 index 00000000..03987fdf --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml @@ -0,0 +1,8 @@ +dataset_name: international_law +description: 'The following are multiple choice questions (with answers) about international + law. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_international_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml new file mode 100644 index 00000000..a95f42ed --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml @@ -0,0 +1,7 @@ +dataset_name: jurisprudence +description: 'The following are multiple choice questions (with answers) about jurisprudence. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_jurisprudence diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml new file mode 100644 index 00000000..3b8b7b98 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml @@ -0,0 +1,8 @@ +dataset_name: logical_fallacies +description: 'The following are multiple choice questions (with answers) about logical + fallacies. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_logical_fallacies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml new file mode 100644 index 00000000..473a2bc9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml @@ -0,0 +1,8 @@ +dataset_name: machine_learning +description: 'The following are multiple choice questions (with answers) about machine + learning. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_machine_learning diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml new file mode 100644 index 00000000..70eb8768 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml @@ -0,0 +1,7 @@ +dataset_name: management +description: 'The following are multiple choice questions (with answers) about management. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_management diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml new file mode 100644 index 00000000..48c03524 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml @@ -0,0 +1,7 @@ +dataset_name: marketing +description: 'The following are multiple choice questions (with answers) about marketing. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_marketing diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml new file mode 100644 index 00000000..ef221495 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml @@ -0,0 +1,8 @@ +dataset_name: medical_genetics +description: 'The following are multiple choice questions (with answers) about medical + genetics. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_medical_genetics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml new file mode 100644 index 00000000..bde2352b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml @@ -0,0 +1,7 @@ +dataset_name: miscellaneous +description: 'The following are multiple choice questions (with answers) about miscellaneous. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_miscellaneous diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml new file mode 100644 index 00000000..36ca7f98 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml @@ -0,0 +1,8 @@ +dataset_name: moral_disputes +description: 'The following are multiple choice questions (with answers) about moral + disputes. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_moral_disputes diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml new file mode 100644 index 00000000..5415d5b4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml @@ -0,0 +1,8 @@ +dataset_name: moral_scenarios +description: 'The following are multiple choice questions (with answers) about moral + scenarios. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_moral_scenarios diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml new file mode 100644 index 00000000..34c0040a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml @@ -0,0 +1,7 @@ +dataset_name: nutrition +description: 'The following are multiple choice questions (with answers) about nutrition. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_nutrition diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml new file mode 100644 index 00000000..83588531 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml @@ -0,0 +1,7 @@ +dataset_name: philosophy +description: 'The following are multiple choice questions (with answers) about philosophy. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_philosophy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml new file mode 100644 index 00000000..a94b514b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml @@ -0,0 +1,7 @@ +dataset_name: prehistory +description: 'The following are multiple choice questions (with answers) about prehistory. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_prehistory diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml new file mode 100644 index 00000000..98fa6bd8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_accounting +description: 'The following are multiple choice questions (with answers) about professional + accounting. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_professional_accounting diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml new file mode 100644 index 00000000..aada41a7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_law +description: 'The following are multiple choice questions (with answers) about professional + law. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_professional_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml new file mode 100644 index 00000000..3febeb67 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_medicine +description: 'The following are multiple choice questions (with answers) about professional + medicine. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_professional_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml new file mode 100644 index 00000000..33b77f62 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_psychology +description: 'The following are multiple choice questions (with answers) about professional + psychology. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_professional_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml new file mode 100644 index 00000000..dd7f9976 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml @@ -0,0 +1,8 @@ +dataset_name: public_relations +description: 'The following are multiple choice questions (with answers) about public + relations. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_public_relations diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml new file mode 100644 index 00000000..cc236bd4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml @@ -0,0 +1,8 @@ +dataset_name: security_studies +description: 'The following are multiple choice questions (with answers) about security + studies. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_security_studies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml new file mode 100644 index 00000000..11069f9e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml @@ -0,0 +1,7 @@ +dataset_name: sociology +description: 'The following are multiple choice questions (with answers) about sociology. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_sociology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml new file mode 100644 index 00000000..42b51e5e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml @@ -0,0 +1,8 @@ +dataset_name: us_foreign_policy +description: 'The following are multiple choice questions (with answers) about us + foreign policy. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_us_foreign_policy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml new file mode 100644 index 00000000..9a9b94a1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml @@ -0,0 +1,7 @@ +dataset_name: virology +description: 'The following are multiple choice questions (with answers) about virology. + + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_virology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml new file mode 100644 index 00000000..1525efe6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml @@ -0,0 +1,8 @@ +dataset_name: world_religions +description: 'The following are multiple choice questions (with answers) about world + religions. 
+ + + ' +include: _mmlu_flan_loglikelihood_template_yaml +task: mmlu_flan_n_shot_loglikelihood_world_religions diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml new file mode 100644 index 00000000..a4e54c9e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml @@ -0,0 +1,8 @@ +dataset_name: machine_learning +description: 'The following are multiple choice questions (with answers) about machine + learning. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_machine_learning diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml new file mode 100644 index 00000000..9c0c65b0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml @@ -0,0 +1,7 @@ +dataset_name: management +description: 'The following are multiple choice questions (with answers) about management. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_management diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml new file mode 100644 index 00000000..e2a74ca0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml @@ -0,0 +1,7 @@ +dataset_name: marketing +description: 'The following are multiple choice questions (with answers) about marketing. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_marketing diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml new file mode 100644 index 00000000..2c27958f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml @@ -0,0 +1,8 @@ +dataset_name: medical_genetics +description: 'The following are multiple choice questions (with answers) about medical + genetics. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_medical_genetics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml new file mode 100644 index 00000000..389ca552 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml @@ -0,0 +1,7 @@ +dataset_name: miscellaneous +description: 'The following are multiple choice questions (with answers) about miscellaneous. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_miscellaneous diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml new file mode 100644 index 00000000..5f869327 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml @@ -0,0 +1,8 @@ +dataset_name: moral_disputes +description: 'The following are multiple choice questions (with answers) about moral + disputes. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_moral_disputes diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml new file mode 100644 index 00000000..ecc63596 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml @@ -0,0 +1,8 @@ +dataset_name: moral_scenarios +description: 'The following are multiple choice questions (with answers) about moral + scenarios. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_moral_scenarios diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml new file mode 100644 index 00000000..6d2da5cb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml @@ -0,0 +1,7 @@ +dataset_name: nutrition +description: 'The following are multiple choice questions (with answers) about nutrition. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_nutrition diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml new file mode 100644 index 00000000..421c50f9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml @@ -0,0 +1,7 @@ +dataset_name: philosophy +description: 'The following are multiple choice questions (with answers) about philosophy. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_philosophy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml new file mode 100644 index 00000000..6e534911 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml @@ -0,0 +1,7 @@ +dataset_name: prehistory +description: 'The following are multiple choice questions (with answers) about prehistory. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_prehistory diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml new file mode 100644 index 00000000..93afd0fb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_accounting +description: 'The following are multiple choice questions (with answers) about professional + accounting. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_professional_accounting diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml new file mode 100644 index 00000000..d1e02680 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_law +description: 'The following are multiple choice questions (with answers) about professional + law. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_professional_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml new file mode 100644 index 00000000..2e39c273 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_medicine +description: 'The following are multiple choice questions (with answers) about professional + medicine. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_professional_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml new file mode 100644 index 00000000..2de37e23 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml @@ -0,0 +1,8 @@ +dataset_name: professional_psychology +description: 'The following are multiple choice questions (with answers) about professional + psychology. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_professional_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml new file mode 100644 index 00000000..d87a9a0a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml @@ -0,0 +1,8 @@ +dataset_name: public_relations +description: 'The following are multiple choice questions (with answers) about public + relations. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_public_relations diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml new file mode 100644 index 00000000..84c4fa9e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml @@ -0,0 +1,8 @@ +dataset_name: security_studies +description: 'The following are multiple choice questions (with answers) about security + studies. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_security_studies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml new file mode 100644 index 00000000..bca11a0a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml @@ -0,0 +1,7 @@ +dataset_name: sociology +description: 'The following are multiple choice questions (with answers) about sociology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_sociology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml new file mode 100644 index 00000000..4672df82 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml @@ -0,0 +1,8 @@ +dataset_name: us_foreign_policy +description: 'The following are multiple choice questions (with answers) about us + foreign policy. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_us_foreign_policy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml new file mode 100644 index 00000000..6f6d1680 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml @@ -0,0 +1,7 @@ +dataset_name: virology +description: 'The following are multiple choice questions (with answers) about virology. + + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_virology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml new file mode 100644 index 00000000..e53b98c8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml @@ -0,0 +1,8 @@ +dataset_name: world_religions +description: 'The following are multiple choice questions (with answers) about world + religions. 
+ + + ' +include: _mmlu_flan_generative_template_yaml +task: mmlu_flan_n_shot_world_religions -- GitLab From 06d3406e55d4fdc4d8f92061ff8143257aad87ed Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 04:58:40 +0000 Subject: [PATCH 024/212] update --- .../flan/prompt_templates/flan_anli.yaml | 36 ++--- .../flan/prompt_templates/flan_bbh.yaml | 29 ---- .../flan/yaml_templates/held_in_template_yaml | 2 +- lm_eval/benchmarks/flan_held_in.yaml | 20 +-- lm_eval/benchmarks/flan_held_out.yaml | 4 +- lm_eval/benchmarks/t0_eval.yaml | 145 ++++++++++-------- lm_eval/tasks/bbh/_generate_configs.py | 3 + lm_eval/tasks/bbh/_template_yaml | 6 +- lm_eval/tasks/bbh/boolean_expressions.yaml | 4 - lm_eval/tasks/bbh/causal_judgement.yaml | 4 - lm_eval/tasks/bbh/date_understanding.yaml | 4 - lm_eval/tasks/bbh/disambiguation_qa.yaml | 4 - lm_eval/tasks/bbh/dyck_languages.yaml | 4 - lm_eval/tasks/bbh/formal_fallacies.yaml | 4 - lm_eval/tasks/bbh/geometric_shapes.yaml | 4 - lm_eval/tasks/bbh/hyperbaton.yaml | 4 - .../bbh/logical_deduction_five_objects.yaml | 4 - .../bbh/logical_deduction_seven_objects.yaml | 4 - .../bbh/logical_deduction_three_objects.yaml | 4 - lm_eval/tasks/bbh/movie_recommendation.yaml | 4 - .../tasks/bbh/multistep_arithmetic_two.yaml | 4 - lm_eval/tasks/bbh/navigate.yaml | 4 - lm_eval/tasks/bbh/object_counting.yaml | 4 - lm_eval/tasks/bbh/penguins_in_a_table.yaml | 4 - .../bbh/reasoning_about_colored_objects.yaml | 4 - lm_eval/tasks/bbh/ruin_names.yaml | 4 - .../salient_translation_error_detection.yaml | 4 - lm_eval/tasks/bbh/snarks.yaml | 4 - lm_eval/tasks/bbh/sports_understanding.yaml | 4 - lm_eval/tasks/bbh/temporal_sequences.yaml | 4 - ...racking_shuffled_objects_five_objects.yaml | 4 - ...acking_shuffled_objects_seven_objects.yaml | 4 - ...acking_shuffled_objects_three_objects.yaml | 4 - lm_eval/tasks/bbh/web_of_lies.yaml | 4 - lm_eval/tasks/bbh/word_sorting.yaml | 4 - lm_eval/tasks/mmlu/_generate_configs.py | 2 +- .../_mmlu_flan_generative_template_yaml | 17 +- .../_mmlu_flan_loglikelihood_template_yaml | 10 +- .../flan_n_shot/mmlu_abstract_algebra.yaml | 3 +- .../flan_n_shot/mmlu_business_ethics.yaml | 3 +- .../flan_n_shot/mmlu_clinical_knowledge.yaml | 3 +- .../flan_n_shot/mmlu_college_biology.yaml | 3 +- .../flan_n_shot/mmlu_college_chemistry.yaml | 3 +- .../mmlu_college_computer_science.yaml | 3 +- .../flan_n_shot/mmlu_college_mathematics.yaml | 3 +- .../flan_n_shot/mmlu_college_medicine.yaml | 3 +- .../flan_n_shot/mmlu_college_physics.yaml | 3 +- .../flan_n_shot/mmlu_computer_security.yaml | 3 +- .../flan_n_shot/mmlu_conceptual_physics.yaml | 3 +- .../mmlu_electrical_engineering.yaml | 3 +- .../mmlu_elementary_mathematics.yaml | 3 +- .../mmlu/flan_n_shot/mmlu_formal_logic.yaml | 3 +- .../mmlu/flan_n_shot/mmlu_global_facts.yaml | 3 +- .../flan_n_shot/mmlu_high_school_biology.yaml | 3 +- .../mmlu_high_school_chemistry.yaml | 3 +- .../mmlu_high_school_computer_science.yaml | 3 +- .../mmlu_high_school_european_history.yaml | 3 +- .../mmlu_high_school_geography.yaml | 3 +- ...u_high_school_government_and_politics.yaml | 3 +- .../mmlu_high_school_macroeconomics.yaml | 3 +- .../mmlu_high_school_mathematics.yaml | 3 +- .../mmlu_high_school_microeconomics.yaml | 3 +- .../flan_n_shot/mmlu_high_school_physics.yaml | 3 +- .../mmlu_high_school_psychology.yaml | 3 +- .../mmlu_high_school_statistics.yaml | 3 +- .../mmlu_high_school_us_history.yaml | 3 +- .../mmlu_high_school_world_history.yaml | 3 +- .../mmlu/flan_n_shot/mmlu_human_aging.yaml | 3 +- 
.../flan_n_shot/mmlu_human_sexuality.yaml | 3 +- .../flan_n_shot/mmlu_international_law.yaml | 3 +- .../flan_n_shot/mmlu_logical_fallacies.yaml | 3 +- .../mmlu_loglikelihood_abstract_algebra.yaml | 3 +- .../mmlu_loglikelihood_business_ethics.yaml | 3 +- ...mmlu_loglikelihood_clinical_knowledge.yaml | 3 +- .../mmlu_loglikelihood_college_biology.yaml | 3 +- .../mmlu_loglikelihood_college_chemistry.yaml | 3 +- ...oglikelihood_college_computer_science.yaml | 3 +- ...mlu_loglikelihood_college_mathematics.yaml | 3 +- .../mmlu_loglikelihood_college_medicine.yaml | 3 +- .../mmlu_loglikelihood_college_physics.yaml | 3 +- .../mmlu_loglikelihood_computer_security.yaml | 3 +- ...mmlu_loglikelihood_conceptual_physics.yaml | 3 +- ..._loglikelihood_electrical_engineering.yaml | 3 +- ..._loglikelihood_elementary_mathematics.yaml | 3 +- .../mmlu_loglikelihood_formal_logic.yaml | 3 +- .../mmlu_loglikelihood_global_facts.yaml | 3 +- ...mlu_loglikelihood_high_school_biology.yaml | 3 +- ...u_loglikelihood_high_school_chemistry.yaml | 3 +- ...kelihood_high_school_computer_science.yaml | 3 +- ...kelihood_high_school_european_history.yaml | 3 +- ...u_loglikelihood_high_school_geography.yaml | 3 +- ...d_high_school_government_and_politics.yaml | 3 +- ...likelihood_high_school_macroeconomics.yaml | 3 +- ...loglikelihood_high_school_mathematics.yaml | 3 +- ...likelihood_high_school_microeconomics.yaml | 3 +- ...mlu_loglikelihood_high_school_physics.yaml | 3 +- ..._loglikelihood_high_school_psychology.yaml | 3 +- ..._loglikelihood_high_school_statistics.yaml | 3 +- ..._loglikelihood_high_school_us_history.yaml | 3 +- ...glikelihood_high_school_world_history.yaml | 3 +- .../mmlu_loglikelihood_human_aging.yaml | 3 +- .../mmlu_loglikelihood_human_sexuality.yaml | 3 +- .../mmlu_loglikelihood_international_law.yaml | 3 +- .../mmlu_loglikelihood_logical_fallacies.yaml | 3 +- .../mmlu_loglikelihood_machine_learning.yaml | 3 +- .../mmlu_loglikelihood_medical_genetics.yaml | 3 +- .../mmlu_loglikelihood_moral_disputes.yaml | 3 +- .../mmlu_loglikelihood_moral_scenarios.yaml | 3 +- ...loglikelihood_professional_accounting.yaml | 3 +- .../mmlu_loglikelihood_professional_law.yaml | 3 +- ...u_loglikelihood_professional_medicine.yaml | 3 +- ...loglikelihood_professional_psychology.yaml | 3 +- .../mmlu_loglikelihood_public_relations.yaml | 3 +- .../mmlu_loglikelihood_security_studies.yaml | 3 +- .../mmlu_loglikelihood_us_foreign_policy.yaml | 3 +- .../mmlu_loglikelihood_world_religions.yaml | 3 +- .../flan_n_shot/mmlu_machine_learning.yaml | 3 +- .../flan_n_shot/mmlu_medical_genetics.yaml | 3 +- .../mmlu/flan_n_shot/mmlu_moral_disputes.yaml | 3 +- .../flan_n_shot/mmlu_moral_scenarios.yaml | 3 +- .../mmlu_professional_accounting.yaml | 3 +- .../flan_n_shot/mmlu_professional_law.yaml | 3 +- .../mmlu_professional_medicine.yaml | 3 +- .../mmlu_professional_psychology.yaml | 3 +- .../flan_n_shot/mmlu_public_relations.yaml | 3 +- .../flan_n_shot/mmlu_security_studies.yaml | 3 +- .../flan_n_shot/mmlu_us_foreign_policy.yaml | 3 +- .../flan_n_shot/mmlu_world_religions.yaml | 3 +- lm_eval/tasks/super_glue/cb/t5_utils.py | 4 +- 129 files changed, 224 insertions(+), 432 deletions(-) delete mode 100644 lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml delete mode 100644 lm_eval/tasks/bbh/boolean_expressions.yaml delete mode 100644 lm_eval/tasks/bbh/causal_judgement.yaml delete mode 100644 lm_eval/tasks/bbh/date_understanding.yaml delete mode 100644 lm_eval/tasks/bbh/disambiguation_qa.yaml delete mode 100644 
lm_eval/tasks/bbh/dyck_languages.yaml delete mode 100644 lm_eval/tasks/bbh/formal_fallacies.yaml delete mode 100644 lm_eval/tasks/bbh/geometric_shapes.yaml delete mode 100644 lm_eval/tasks/bbh/hyperbaton.yaml delete mode 100644 lm_eval/tasks/bbh/logical_deduction_five_objects.yaml delete mode 100644 lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml delete mode 100644 lm_eval/tasks/bbh/logical_deduction_three_objects.yaml delete mode 100644 lm_eval/tasks/bbh/movie_recommendation.yaml delete mode 100644 lm_eval/tasks/bbh/multistep_arithmetic_two.yaml delete mode 100644 lm_eval/tasks/bbh/navigate.yaml delete mode 100644 lm_eval/tasks/bbh/object_counting.yaml delete mode 100644 lm_eval/tasks/bbh/penguins_in_a_table.yaml delete mode 100644 lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml delete mode 100644 lm_eval/tasks/bbh/ruin_names.yaml delete mode 100644 lm_eval/tasks/bbh/salient_translation_error_detection.yaml delete mode 100644 lm_eval/tasks/bbh/snarks.yaml delete mode 100644 lm_eval/tasks/bbh/sports_understanding.yaml delete mode 100644 lm_eval/tasks/bbh/temporal_sequences.yaml delete mode 100644 lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml delete mode 100644 lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml delete mode 100644 lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml delete mode 100644 lm_eval/tasks/bbh/web_of_lies.yaml delete mode 100644 lm_eval/tasks/bbh/word_sorting.yaml diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml index 7dae0ce0..9b9f6705 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml @@ -1,29 +1,29 @@ # Flan Prompt Templates prompts: "template-0": - doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" - doc_to_target: """{{["Yes", "It's impossible to say", "No"][label]}}""" + doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nI think the answer is" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-1": - doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-2": - doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-3": - doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "{{context}}\nDoes this next 
sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-4": - doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nThe answer is:" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-5": - doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-6": - doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-7": - doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "Can we draw the following hypothesis from the context (see options)? 
\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" "template-8": - doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{["Yes", "It's impossible to say", "No"][label]}}" + doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" + doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml deleted file mode 100644 index 525e9e0c..00000000 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Flan Prompt Templates -prompts: - "template-0": - doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" - "template-1": - doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" - "template-2": - doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" - "template-3": - doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" - "template-4": - doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" - "template-5": - doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" - "template-6": - doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" - "template-7": - doc_to_text: "Can we draw the following hypothesis from the context (see options)? 
\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-8":
-    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
diff --git a/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml b/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml
index f28774c6..2f4a4c84 100644
--- a/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml
+++ b/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml
@@ -8,6 +8,6 @@ metric_list:
     ignore_punctuation: true
 generation_kwargs:
   until:
-    - "\n\n"
+    - "</s>"
   do_sample: false
   temperature: 0.0
diff --git a/lm_eval/benchmarks/flan_held_in.yaml b/lm_eval/benchmarks/flan_held_in.yaml
index a560bda8..f6d62f03 100644
--- a/lm_eval/benchmarks/flan_held_in.yaml
+++ b/lm_eval/benchmarks/flan_held_in.yaml
@@ -25,13 +25,13 @@ task:
       dataset_path: anli
       use_prompt: flan/prompt_templates/flan_anli.yaml:*
       validation_split: dev_r3
-  # - include: flan/yaml_templates/held_in_template_yaml
-  #   task: ai2_arc
-  #   dataset_path: ARC-Easy
-  #   use_prompt: local:*
-  #   validation_split: validation
-  # - include: flan/yaml_templates/held_in_template_yaml
-  #   task: ai2_arc
-  #   dataset_path: ARC-Challange
-  #   use_prompt: local:*
-  #   validation_split: validation
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: ai2_arc
+    dataset_path: ARC-Easy
+    use_prompt: local:*
+    validation_split: validation
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: ai2_arc
+    dataset_path: ARC-Challenge
+    use_prompt: local:*
+    validation_split: validation
diff --git a/lm_eval/benchmarks/flan_held_out.yaml b/lm_eval/benchmarks/flan_held_out.yaml
index 4cd56468..cde82722 100644
--- a/lm_eval/benchmarks/flan_held_out.yaml
+++ b/lm_eval/benchmarks/flan_held_out.yaml
@@ -1,4 +1,4 @@
 group: flan_held_out
 task:
-  - bbh
-  - mmlu
+  - bbh_flan
+  - mmlu_flan
diff --git a/lm_eval/benchmarks/t0_eval.yaml b/lm_eval/benchmarks/t0_eval.yaml
index 46c28d64..9cd25b51 100644
--- a/lm_eval/benchmarks/t0_eval.yaml
+++ b/lm_eval/benchmarks/t0_eval.yaml
@@ -6,6 +6,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
+    output_type: greedy_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -18,18 +19,6 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  # Natural Language Inference
-  - dataset_path: super_glue
-    dataset_name: cb
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
+    output_type: greedy_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -37,67 +26,86 @@ task:
         higher_is_better: true
         ignore_case: true
         ignore_punctuation: true
-  - dataset_path: super_glue
-    dataset_name: rte
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-
ignore_punctuation: true - - task: anli_r2 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r2 - validation_split: dev_r2 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - task: anli_r3 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r3 - validation_split: dev_r3 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # Sentence Completion - - dataset_path: super_glue - dataset_name: copa - use_prompt: promptsource:* - training_split: train - validation_split: validation - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true + # # Natural Language Inference + # - dataset_path: super_glue + # dataset_name: cb + # use_prompt: promptsource:* + # training_split: train + # validation_split: validation + # output_type: greedy_until + # metric_list: + # - metric: exact_match + # aggregation: mean + # higher_is_better: true + # ignore_case: true + # ignore_punctuation: true + # - dataset_path: super_glue + # dataset_name: rte + # use_prompt: promptsource:* + # training_split: train + # validation_split: validation + # output_type: greedy_until + # metric_list: + # - metric: exact_match + # aggregation: mean + # higher_is_better: true + # ignore_case: true + # ignore_punctuation: true + # - task: anli_r1 + # dataset_path: anli + # use_prompt: promptsource:* + # training_split: train_r1 + # validation_split: dev_r1 + # output_type: greedy_until + # metric_list: + # - metric: exact_match + # aggregation: mean + # higher_is_better: true + # ignore_case: true + # ignore_punctuation: true + # - task: anli_r2 + # dataset_path: anli + # use_prompt: promptsource:* + # training_split: train_r2 + # validation_split: dev_r2 + # output_type: greedy_until + # metric_list: + # - metric: exact_match + # aggregation: mean + # higher_is_better: true + # ignore_case: true + # ignore_punctuation: true + # - task: anli_r3 + # dataset_path: anli + # use_prompt: promptsource:* + # training_split: train_r3 + # validation_split: dev_r3 + # output_type: greedy_until + # metric_list: + # - metric: exact_match + # aggregation: mean + # higher_is_better: true + # ignore_case: true + # ignore_punctuation: true + # # Sentence Completion + # - dataset_path: super_glue + # dataset_name: copa + # use_prompt: promptsource:* + # training_split: train + # validation_split: validation + # output_type: greedy_until + # metric_list: + # - metric: exact_match + # aggregation: mean + # higher_is_better: true + # ignore_case: true + # ignore_punctuation: true # Natural Language Inference - dataset_path: hellaswag use_prompt: promptsource:* training_split: train validation_split: validation + output_type: greedy_until metric_list: - metric: exact_match aggregation: mean @@ -110,6 +118,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation + output_type: greedy_until metric_list: - metric: exact_match aggregation: mean diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index 40e4c07d..9e603994 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -27,3 +27,6 @@ def main() -> None: if __name__ == "__main__": main() + + +# https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/boolean_expressions.txt diff --git 
diff --git a/lm_eval/tasks/bbh/_template_yaml b/lm_eval/tasks/bbh/_template_yaml index af6b74b3..3b174480 100644 --- a/lm_eval/tasks/bbh/_template_yaml +++ b/lm_eval/tasks/bbh/_template_yaml @@ -2,16 +2,14 @@ group: bbh dataset_path: lukaemon/bbh output_type: greedy_until test_split: test -doc_to_text: "{{input}}" +doc_to_text: "Q: {{input}}\nA:" doc_to_target: "{{target}}" metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: false generation_kwargs: until: - - "\n\n" + - "" do_sample: false temperature: 0.0
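The reworked _template_yaml above wraps each BBH example as "Q: ...\nA:" and swaps the "\n\n" stop sequence for an empty placeholder. The harness renders doc_to_text with Jinja2-style templating; a standalone approximation of how the new prompt comes out (the example document is made up for the demo):

from jinja2 import Template

# Illustrative BBH-style record; the field values are invented.
doc = {"input": "not ( True ) and ( True ) is", "target": "False"}

prompt = Template("Q: {{input}}\nA:").render(**doc)
print(prompt)
# Q: not ( True ) and ( True ) is
# A: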
diff --git a/lm_eval/tasks/bbh/boolean_expressions.yaml b/lm_eval/tasks/bbh/boolean_expressions.yaml deleted file mode 100644 index d9895c81..00000000 --- a/lm_eval/tasks/bbh/boolean_expressions.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: boolean_expressions -include: _template_yaml -task: bbh_boolean_expressions diff --git a/lm_eval/tasks/bbh/causal_judgement.yaml b/lm_eval/tasks/bbh/causal_judgement.yaml deleted file mode 100644 index c3d48d53..00000000 --- a/lm_eval/tasks/bbh/causal_judgement.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: causal_judgement -include: _template_yaml -task: bbh_causal_judgement diff --git a/lm_eval/tasks/bbh/date_understanding.yaml b/lm_eval/tasks/bbh/date_understanding.yaml deleted file mode 100644 index 5f60efbe..00000000 --- a/lm_eval/tasks/bbh/date_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: date_understanding -include: _template_yaml -task: bbh_date_understanding diff --git a/lm_eval/tasks/bbh/disambiguation_qa.yaml b/lm_eval/tasks/bbh/disambiguation_qa.yaml deleted file mode 100644 index b043460e..00000000 --- a/lm_eval/tasks/bbh/disambiguation_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: disambiguation_qa -include: _template_yaml -task: bbh_disambiguation_qa diff --git a/lm_eval/tasks/bbh/dyck_languages.yaml b/lm_eval/tasks/bbh/dyck_languages.yaml deleted file mode 100644 index 6b6648d0..00000000 --- a/lm_eval/tasks/bbh/dyck_languages.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: dyck_languages -include: _template_yaml -task: bbh_dyck_languages diff --git a/lm_eval/tasks/bbh/formal_fallacies.yaml b/lm_eval/tasks/bbh/formal_fallacies.yaml deleted file mode 100644 index 18d30c91..00000000 --- a/lm_eval/tasks/bbh/formal_fallacies.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: formal_fallacies -include: _template_yaml -task: bbh_formal_fallacies diff --git a/lm_eval/tasks/bbh/geometric_shapes.yaml b/lm_eval/tasks/bbh/geometric_shapes.yaml deleted file mode 100644 index 9616b6d6..00000000 --- a/lm_eval/tasks/bbh/geometric_shapes.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: geometric_shapes -include: _template_yaml -task: bbh_geometric_shapes diff --git a/lm_eval/tasks/bbh/hyperbaton.yaml b/lm_eval/tasks/bbh/hyperbaton.yaml deleted file mode 100644 index d1ff5bf8..00000000 --- a/lm_eval/tasks/bbh/hyperbaton.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: hyperbaton -include: _template_yaml -task: bbh_hyperbaton diff --git a/lm_eval/tasks/bbh/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/logical_deduction_five_objects.yaml deleted file mode 100644 index 91e6ec74..00000000 --- a/lm_eval/tasks/bbh/logical_deduction_five_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: logical_deduction_five_objects -include: _template_yaml -task: bbh_logical_deduction_five_objects diff --git a/lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml deleted file mode 100644 index 342cf563..00000000 --- a/lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: logical_deduction_seven_objects -include: _template_yaml -task: bbh_logical_deduction_seven_objects diff --git a/lm_eval/tasks/bbh/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/logical_deduction_three_objects.yaml deleted file mode 100644 index 6669c6c8..00000000 --- a/lm_eval/tasks/bbh/logical_deduction_three_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: logical_deduction_three_objects -include: _template_yaml -task: bbh_logical_deduction_three_objects diff --git a/lm_eval/tasks/bbh/movie_recommendation.yaml b/lm_eval/tasks/bbh/movie_recommendation.yaml deleted file mode 100644 index af884eec..00000000 --- a/lm_eval/tasks/bbh/movie_recommendation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: movie_recommendation -include: _template_yaml -task: bbh_movie_recommendation diff --git a/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml deleted file mode 100644 index 2ab191b8..00000000 --- a/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: multistep_arithmetic_two -include: _template_yaml -task: bbh_multistep_arithmetic_two diff --git a/lm_eval/tasks/bbh/navigate.yaml b/lm_eval/tasks/bbh/navigate.yaml deleted file mode 100644 index f737a9c5..00000000 --- a/lm_eval/tasks/bbh/navigate.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: navigate -include: _template_yaml -task: bbh_navigate diff --git a/lm_eval/tasks/bbh/object_counting.yaml b/lm_eval/tasks/bbh/object_counting.yaml deleted file mode 100644 index 606bd92d..00000000 --- a/lm_eval/tasks/bbh/object_counting.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: object_counting -include: _template_yaml -task: bbh_object_counting diff --git a/lm_eval/tasks/bbh/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/penguins_in_a_table.yaml deleted file mode 100644 index 25e183ce..00000000 --- a/lm_eval/tasks/bbh/penguins_in_a_table.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: penguins_in_a_table -include: _template_yaml -task: bbh_penguins_in_a_table diff --git a/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml deleted file mode 100644 index 785e0b2e..00000000 --- a/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: reasoning_about_colored_objects -include: _template_yaml -task: bbh_reasoning_about_colored_objects diff --git a/lm_eval/tasks/bbh/ruin_names.yaml b/lm_eval/tasks/bbh/ruin_names.yaml deleted file mode 100644 index aef28b1c..00000000 --- a/lm_eval/tasks/bbh/ruin_names.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: ruin_names -include: _template_yaml -task: bbh_ruin_names diff --git a/lm_eval/tasks/bbh/salient_translation_error_detection.yaml 
b/lm_eval/tasks/bbh/salient_translation_error_detection.yaml deleted file mode 100644 index 433867fe..00000000 --- a/lm_eval/tasks/bbh/salient_translation_error_detection.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: salient_translation_error_detection -include: _template_yaml -task: bbh_salient_translation_error_detection diff --git a/lm_eval/tasks/bbh/snarks.yaml b/lm_eval/tasks/bbh/snarks.yaml deleted file mode 100644 index 49f57d20..00000000 --- a/lm_eval/tasks/bbh/snarks.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: snarks -include: _template_yaml -task: bbh_snarks diff --git a/lm_eval/tasks/bbh/sports_understanding.yaml b/lm_eval/tasks/bbh/sports_understanding.yaml deleted file mode 100644 index cf84b1e1..00000000 --- a/lm_eval/tasks/bbh/sports_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: sports_understanding -include: _template_yaml -task: bbh_sports_understanding diff --git a/lm_eval/tasks/bbh/temporal_sequences.yaml b/lm_eval/tasks/bbh/temporal_sequences.yaml deleted file mode 100644 index b3f5c0af..00000000 --- a/lm_eval/tasks/bbh/temporal_sequences.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: temporal_sequences -include: _template_yaml -task: bbh_temporal_sequences diff --git a/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml deleted file mode 100644 index d4ca2fe0..00000000 --- a/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: tracking_shuffled_objects_five_objects -include: _template_yaml -task: bbh_tracking_shuffled_objects_five_objects diff --git a/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml deleted file mode 100644 index 20fff820..00000000 --- a/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: tracking_shuffled_objects_seven_objects -include: _template_yaml -task: bbh_tracking_shuffled_objects_seven_objects diff --git a/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml deleted file mode 100644 index f219d30c..00000000 --- a/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: tracking_shuffled_objects_three_objects -include: _template_yaml -task: bbh_tracking_shuffled_objects_three_objects diff --git a/lm_eval/tasks/bbh/web_of_lies.yaml b/lm_eval/tasks/bbh/web_of_lies.yaml deleted file mode 100644 index 18dcb591..00000000 --- a/lm_eval/tasks/bbh/web_of_lies.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: web_of_lies -include: _template_yaml -task: bbh_web_of_lies diff --git a/lm_eval/tasks/bbh/word_sorting.yaml b/lm_eval/tasks/bbh/word_sorting.yaml deleted file mode 100644 index 11725e0a..00000000 --- a/lm_eval/tasks/bbh/word_sorting.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: word_sorting -include: _template_yaml -task: bbh_word_sorting diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index db0e20cd..af9bd0c6 100644 --- 
a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -115,4 +115,4 @@ if __name__ == "__main__": file_save_path = args.save_prefix_path + f"_{subject}.yaml" eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") with open(file_save_path, "w") as yaml_file: - yaml.dump(yaml_dict, yaml_file) + yaml.dump(yaml_dict, yaml_file, width=float("inf"))
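The width=float("inf") argument is what produces the single-line description strings in the regenerated YAMLs that follow: PyYAML's emitter wraps long scalars at roughly 80 columns by default, which is why each description was previously split across two lines. A minimal standalone sketch of the difference:

import yaml

desc = ("The following are multiple choice questions (with answers) "
        "about abstract algebra.\n\n")

# Default emitter: wraps the long scalar at ~80 columns, splitting the
# sentence across two lines inside the quoted string.
print(yaml.dump({"description": desc}))

# width=float("inf") disables wrapping, keeping the description on a
# single line, as in the regenerated task configs below.
print(yaml.dump({"description": desc}, width=float("inf")))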
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml index b369024c..3f649666 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml @@ -1,18 +1,21 @@ group: mmlu_flan dataset_path: cais/mmlu -validation_split: validation +# validation_split: validation +test_split: test fewshot_split: dev -doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:" +# doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: " +doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" output_type: greedy_until -doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" +# doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" +doc_to_target: "{{['A', 'B', 'C', 'D'][answer]}}" metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: true + # ignore_case: true + # ignore_punctuation: true generation_kwargs: until: - "" - do_sample: false - temperature: 0.0 \ No newline at end of file +# do_sample: false +# temperature: 0.0 \ No newline at end of file diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml index eb38e0fe..2a09f787 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml @@ -1,12 +1,16 @@ group: mmlu_flan_loglikelihood dataset_path: cais/mmlu -validation_split: validation +# validation_split: validation +test_split: test fewshot_split: dev -doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:" output_type: multiple_choice -doc_to_choice: ['(A)', '(B)', '(C)', '(D)'] +doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" +doc_to_choice: ["A", "B", "C", "D"] doc_to_target: answer metric_list: - metric: acc aggregation: mean higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true \ No newline at end of file diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml index 9fca2117..31729f37 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml @@ -1,6 +1,5 @@ dataset_name: abstract_algebra -description: 'The following are multiple choice questions (with answers) about abstract - algebra. +description: 'The following are multiple choice questions (with answers) about abstract algebra. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml index f75a48fd..d1dcf3c7 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml @@ -1,6 +1,5 @@ dataset_name: business_ethics -description: 'The following are multiple choice questions (with answers) about business - ethics. +description: 'The following are multiple choice questions (with answers) about business ethics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml index 07a3fe79..14b12359 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml @@ -1,6 +1,5 @@ dataset_name: clinical_knowledge -description: 'The following are multiple choice questions (with answers) about clinical - knowledge. +description: 'The following are multiple choice questions (with answers) about clinical knowledge. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml index 7465f0d3..0d202b8e 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml @@ -1,6 +1,5 @@ dataset_name: college_biology -description: 'The following are multiple choice questions (with answers) about college - biology. +description: 'The following are multiple choice questions (with answers) about college biology. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml index 17e7fbde..77f6328f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml @@ -1,6 +1,5 @@ dataset_name: college_chemistry -description: 'The following are multiple choice questions (with answers) about college - chemistry. +description: 'The following are multiple choice questions (with answers) about college chemistry. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml index d0032874..f5cbda28 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml @@ -1,6 +1,5 @@ dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college - computer science. +description: 'The following are multiple choice questions (with answers) about college computer science. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml index be1e01b2..dbc9be4c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml @@ -1,6 +1,5 @@ dataset_name: college_mathematics -description: 'The following are multiple choice questions (with answers) about college - mathematics. +description: 'The following are multiple choice questions (with answers) about college mathematics.
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml index 4c8aa79e..efc868f0 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml @@ -1,6 +1,5 @@ dataset_name: college_medicine -description: 'The following are multiple choice questions (with answers) about college - medicine. +description: 'The following are multiple choice questions (with answers) about college medicine. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml index cd07980b..d92c14ea 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml @@ -1,6 +1,5 @@ dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college - physics. +description: 'The following are multiple choice questions (with answers) about college physics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml index 93dc9040..3ddf3ee5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml @@ -1,6 +1,5 @@ dataset_name: computer_security -description: 'The following are multiple choice questions (with answers) about computer - security. +description: 'The following are multiple choice questions (with answers) about computer security. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml index 2f313298..7c4f90ed 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml @@ -1,6 +1,5 @@ dataset_name: conceptual_physics -description: 'The following are multiple choice questions (with answers) about conceptual - physics. +description: 'The following are multiple choice questions (with answers) about conceptual physics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml index 5d1ccca1..0308fe16 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml @@ -1,6 +1,5 @@ dataset_name: electrical_engineering -description: 'The following are multiple choice questions (with answers) about electrical - engineering. +description: 'The following are multiple choice questions (with answers) about electrical engineering. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml index 7260b752..2b8a8caf 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml @@ -1,6 +1,5 @@ dataset_name: elementary_mathematics -description: 'The following are multiple choice questions (with answers) about elementary - mathematics. +description: 'The following are multiple choice questions (with answers) about elementary mathematics. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml index 7dd42af5..10f58f41 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml @@ -1,6 +1,5 @@ dataset_name: formal_logic -description: 'The following are multiple choice questions (with answers) about formal - logic. +description: 'The following are multiple choice questions (with answers) about formal logic. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml index 373f99fb..48816fe3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml @@ -1,6 +1,5 @@ dataset_name: global_facts -description: 'The following are multiple choice questions (with answers) about global - facts. +description: 'The following are multiple choice questions (with answers) about global facts. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml index 334286dc..ebb1ded2 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_biology -description: 'The following are multiple choice questions (with answers) about high - school biology. +description: 'The following are multiple choice questions (with answers) about high school biology. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml index f4d3bcfb..66a484a3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_chemistry -description: 'The following are multiple choice questions (with answers) about high - school chemistry. +description: 'The following are multiple choice questions (with answers) about high school chemistry. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml index 03e0411b..b9a9060c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_computer_science -description: 'The following are multiple choice questions (with answers) about high - school computer science. +description: 'The following are multiple choice questions (with answers) about high school computer science. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml index a7a73a24..f89cca29 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_european_history -description: 'The following are multiple choice questions (with answers) about high - school european history. +description: 'The following are multiple choice questions (with answers) about high school european history. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml index d7b29960..f255d37a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_geography -description: 'The following are multiple choice questions (with answers) about high - school geography. +description: 'The following are multiple choice questions (with answers) about high school geography. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml index dbb195e2..108aebf8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_government_and_politics -description: 'The following are multiple choice questions (with answers) about high - school government and politics. +description: 'The following are multiple choice questions (with answers) about high school government and politics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml index a8e6c4cd..720baeac 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high - school macroeconomics. +description: 'The following are multiple choice questions (with answers) about high school macroeconomics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml index 35adc8b7..fbad67d6 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_mathematics -description: 'The following are multiple choice questions (with answers) about high - school mathematics. +description: 'The following are multiple choice questions (with answers) about high school mathematics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml index a0887261..4b4e85bd 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_microeconomics -description: 'The following are multiple choice questions (with answers) about high - school microeconomics. +description: 'The following are multiple choice questions (with answers) about high school microeconomics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml index 63a77a7d..941d6c22 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_physics -description: 'The following are multiple choice questions (with answers) about high - school physics. 
+description: 'The following are multiple choice questions (with answers) about high school physics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml index 7d738494..831907f0 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high - school psychology. +description: 'The following are multiple choice questions (with answers) about high school psychology. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml index 6b3e4b5a..255c7394 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high - school statistics. +description: 'The following are multiple choice questions (with answers) about high school statistics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml index e80b64e1..4ea76cd3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_us_history -description: 'The following are multiple choice questions (with answers) about high - school us history. +description: 'The following are multiple choice questions (with answers) about high school us history. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml index 83d55ae6..26551e82 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_world_history -description: 'The following are multiple choice questions (with answers) about high - school world history. +description: 'The following are multiple choice questions (with answers) about high school world history. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml index 2ff416b8..042e81cf 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml @@ -1,6 +1,5 @@ dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human - aging. +description: 'The following are multiple choice questions (with answers) about human aging. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml index 2d7316ac..d2d55b70 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml @@ -1,6 +1,5 @@ dataset_name: human_sexuality -description: 'The following are multiple choice questions (with answers) about human - sexuality. +description: 'The following are multiple choice questions (with answers) about human sexuality. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml index 547cb3c0..12b18807 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml @@ -1,6 +1,5 @@ dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international - law. +description: 'The following are multiple choice questions (with answers) about international law. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml index 1b8a4d2b..aaaef665 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml @@ -1,6 +1,5 @@ dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical - fallacies. +description: 'The following are multiple choice questions (with answers) about logical fallacies. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml index 60973953..19f6d3ee 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml @@ -1,6 +1,5 @@ dataset_name: abstract_algebra -description: 'The following are multiple choice questions (with answers) about abstract - algebra. +description: 'The following are multiple choice questions (with answers) about abstract algebra. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml index 3f2bcc77..b6917938 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml @@ -1,6 +1,5 @@ dataset_name: business_ethics -description: 'The following are multiple choice questions (with answers) about business - ethics. +description: 'The following are multiple choice questions (with answers) about business ethics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml index 780c8bf6..bc19e7c5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml @@ -1,6 +1,5 @@ dataset_name: clinical_knowledge -description: 'The following are multiple choice questions (with answers) about clinical - knowledge. +description: 'The following are multiple choice questions (with answers) about clinical knowledge. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml index 7d270f47..defc3d98 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml @@ -1,6 +1,5 @@ dataset_name: college_biology -description: 'The following are multiple choice questions (with answers) about college - biology. +description: 'The following are multiple choice questions (with answers) about college biology. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml index e947d1a2..15a2b3cc 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml @@ -1,6 +1,5 @@ dataset_name: college_chemistry -description: 'The following are multiple choice questions (with answers) about college - chemistry. +description: 'The following are multiple choice questions (with answers) about college chemistry. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml index a23b0bd3..ff69b70e 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml @@ -1,6 +1,5 @@ dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college - computer science. +description: 'The following are multiple choice questions (with answers) about college computer science. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml index c03033eb..fb67c2ee 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml @@ -1,6 +1,5 @@ dataset_name: college_mathematics -description: 'The following are multiple choice questions (with answers) about college - mathematics. +description: 'The following are multiple choice questions (with answers) about college mathematics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml index 64f952bb..6edac775 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml @@ -1,6 +1,5 @@ dataset_name: college_medicine -description: 'The following are multiple choice questions (with answers) about college - medicine. +description: 'The following are multiple choice questions (with answers) about college medicine. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml index f339c316..2af2929f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml @@ -1,6 +1,5 @@ dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college - physics. +description: 'The following are multiple choice questions (with answers) about college physics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml index cc28f843..fe239463 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml @@ -1,6 +1,5 @@ dataset_name: computer_security -description: 'The following are multiple choice questions (with answers) about computer - security. 
+description: 'The following are multiple choice questions (with answers) about computer security. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml index dc3c9096..a593cb97 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml @@ -1,6 +1,5 @@ dataset_name: conceptual_physics -description: 'The following are multiple choice questions (with answers) about conceptual - physics. +description: 'The following are multiple choice questions (with answers) about conceptual physics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml index 20823b42..06e8e3b0 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml @@ -1,6 +1,5 @@ dataset_name: electrical_engineering -description: 'The following are multiple choice questions (with answers) about electrical - engineering. +description: 'The following are multiple choice questions (with answers) about electrical engineering. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml index afed59aa..ea151100 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml @@ -1,6 +1,5 @@ dataset_name: elementary_mathematics -description: 'The following are multiple choice questions (with answers) about elementary - mathematics. +description: 'The following are multiple choice questions (with answers) about elementary mathematics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml index 2a2359f5..2a2299fb 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml @@ -1,6 +1,5 @@ dataset_name: formal_logic -description: 'The following are multiple choice questions (with answers) about formal - logic. +description: 'The following are multiple choice questions (with answers) about formal logic. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml index 4d23b227..b3c003f3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml @@ -1,6 +1,5 @@ dataset_name: global_facts -description: 'The following are multiple choice questions (with answers) about global - facts. +description: 'The following are multiple choice questions (with answers) about global facts. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml index a9bdefee..6a28adbd 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_biology -description: 'The following are multiple choice questions (with answers) about high - school biology. +description: 'The following are multiple choice questions (with answers) about high school biology. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml index ec512f42..dedab0ad 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_chemistry -description: 'The following are multiple choice questions (with answers) about high - school chemistry. +description: 'The following are multiple choice questions (with answers) about high school chemistry. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml index 67d70ec6..8c096fd8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_computer_science -description: 'The following are multiple choice questions (with answers) about high - school computer science. +description: 'The following are multiple choice questions (with answers) about high school computer science. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml index 62c6013b..8b2a2705 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_european_history -description: 'The following are multiple choice questions (with answers) about high - school european history. +description: 'The following are multiple choice questions (with answers) about high school european history. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml index a4b6d856..32bcc3e1 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_geography -description: 'The following are multiple choice questions (with answers) about high - school geography. +description: 'The following are multiple choice questions (with answers) about high school geography. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml index f7c2cb8c..191bc63b 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_government_and_politics -description: 'The following are multiple choice questions (with answers) about high - school government and politics. +description: 'The following are multiple choice questions (with answers) about high school government and politics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml index b623360e..838ffed9 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high - school macroeconomics. +description: 'The following are multiple choice questions (with answers) about high school macroeconomics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml index b2ddfc0e..246d8988 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_mathematics -description: 'The following are multiple choice questions (with answers) about high - school mathematics. +description: 'The following are multiple choice questions (with answers) about high school mathematics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml index e8bff48d..1fea1850 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_microeconomics -description: 'The following are multiple choice questions (with answers) about high - school microeconomics. +description: 'The following are multiple choice questions (with answers) about high school microeconomics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml index 2b97ac1d..6aa802e8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_physics -description: 'The following are multiple choice questions (with answers) about high - school physics. +description: 'The following are multiple choice questions (with answers) about high school physics. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml index fecb5f70..521b3e54 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high - school psychology. +description: 'The following are multiple choice questions (with answers) about high school psychology. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml index 4ff766db..3cd82472 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high - school statistics. +description: 'The following are multiple choice questions (with answers) about high school statistics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml index f725e916..34a7d05d 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_us_history -description: 'The following are multiple choice questions (with answers) about high - school us history. +description: 'The following are multiple choice questions (with answers) about high school us history. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml index 0142ce33..b6390aa3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml @@ -1,6 +1,5 @@ dataset_name: high_school_world_history -description: 'The following are multiple choice questions (with answers) about high - school world history. +description: 'The following are multiple choice questions (with answers) about high school world history. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml index 4b007ca1..bf454427 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml @@ -1,6 +1,5 @@ dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human - aging. +description: 'The following are multiple choice questions (with answers) about human aging. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml index 37d5e42a..7bec1aa9 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml @@ -1,6 +1,5 @@ dataset_name: human_sexuality -description: 'The following are multiple choice questions (with answers) about human - sexuality. +description: 'The following are multiple choice questions (with answers) about human sexuality. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml index 03987fdf..6d56237f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml @@ -1,6 +1,5 @@ dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international - law. +description: 'The following are multiple choice questions (with answers) about international law. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml index 3b8b7b98..bc2b1b41 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml @@ -1,6 +1,5 @@ dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical - fallacies. +description: 'The following are multiple choice questions (with answers) about logical fallacies. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml index 473a2bc9..3688fd28 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml @@ -1,6 +1,5 @@ dataset_name: machine_learning -description: 'The following are multiple choice questions (with answers) about machine - learning. +description: 'The following are multiple choice questions (with answers) about machine learning. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml index ef221495..68bd9c12 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml @@ -1,6 +1,5 @@ dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical - genetics. +description: 'The following are multiple choice questions (with answers) about medical genetics. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml index 36ca7f98..c55d44bd 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml @@ -1,6 +1,5 @@ dataset_name: moral_disputes -description: 'The following are multiple choice questions (with answers) about moral - disputes. 
+description: 'The following are multiple choice questions (with answers) about moral disputes. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml index 5415d5b4..99bcb848 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml @@ -1,6 +1,5 @@ dataset_name: moral_scenarios -description: 'The following are multiple choice questions (with answers) about moral - scenarios. +description: 'The following are multiple choice questions (with answers) about moral scenarios. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml index 98fa6bd8..2f64beae 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml @@ -1,6 +1,5 @@ dataset_name: professional_accounting -description: 'The following are multiple choice questions (with answers) about professional - accounting. +description: 'The following are multiple choice questions (with answers) about professional accounting. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml index aada41a7..de0e8392 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml @@ -1,6 +1,5 @@ dataset_name: professional_law -description: 'The following are multiple choice questions (with answers) about professional - law. +description: 'The following are multiple choice questions (with answers) about professional law. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml index 3febeb67..ef4d0c07 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml @@ -1,6 +1,5 @@ dataset_name: professional_medicine -description: 'The following are multiple choice questions (with answers) about professional - medicine. +description: 'The following are multiple choice questions (with answers) about professional medicine. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml index 33b77f62..9bb12be0 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml @@ -1,6 +1,5 @@ dataset_name: professional_psychology -description: 'The following are multiple choice questions (with answers) about professional - psychology. +description: 'The following are multiple choice questions (with answers) about professional psychology. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml index dd7f9976..e9a761cc 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml @@ -1,6 +1,5 @@ dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public - relations. +description: 'The following are multiple choice questions (with answers) about public relations. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml index cc236bd4..6a141ba6 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml @@ -1,6 +1,5 @@ dataset_name: security_studies -description: 'The following are multiple choice questions (with answers) about security - studies. +description: 'The following are multiple choice questions (with answers) about security studies. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml index 42b51e5e..7aa77456 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml @@ -1,6 +1,5 @@ dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us - foreign policy. +description: 'The following are multiple choice questions (with answers) about us foreign policy. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml index 1525efe6..6f26fda1 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml @@ -1,6 +1,5 @@ dataset_name: world_religions -description: 'The following are multiple choice questions (with answers) about world - religions. +description: 'The following are multiple choice questions (with answers) about world religions. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml index a4e54c9e..c97c9f09 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml @@ -1,6 +1,5 @@ dataset_name: machine_learning -description: 'The following are multiple choice questions (with answers) about machine - learning. +description: 'The following are multiple choice questions (with answers) about machine learning. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml index 2c27958f..0464b15c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml @@ -1,6 +1,5 @@ dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical - genetics. +description: 'The following are multiple choice questions (with answers) about medical genetics. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml index 5f869327..671ca84e 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml @@ -1,6 +1,5 @@ dataset_name: moral_disputes -description: 'The following are multiple choice questions (with answers) about moral - disputes. +description: 'The following are multiple choice questions (with answers) about moral disputes. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml index ecc63596..1ecbff40 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml @@ -1,6 +1,5 @@ dataset_name: moral_scenarios -description: 'The following are multiple choice questions (with answers) about moral - scenarios. +description: 'The following are multiple choice questions (with answers) about moral scenarios. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml index 93afd0fb..93b4e4d3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml @@ -1,6 +1,5 @@ dataset_name: professional_accounting -description: 'The following are multiple choice questions (with answers) about professional - accounting. +description: 'The following are multiple choice questions (with answers) about professional accounting. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml index d1e02680..a8704652 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml @@ -1,6 +1,5 @@ dataset_name: professional_law -description: 'The following are multiple choice questions (with answers) about professional - law. +description: 'The following are multiple choice questions (with answers) about professional law. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml index 2e39c273..137a39d5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml @@ -1,6 +1,5 @@ dataset_name: professional_medicine -description: 'The following are multiple choice questions (with answers) about professional - medicine. +description: 'The following are multiple choice questions (with answers) about professional medicine. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml index 2de37e23..342031f7 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml @@ -1,6 +1,5 @@ dataset_name: professional_psychology -description: 'The following are multiple choice questions (with answers) about professional - psychology. +description: 'The following are multiple choice questions (with answers) about professional psychology. 
' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml index d87a9a0a..88ffe1b8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml @@ -1,6 +1,5 @@ dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public - relations. +description: 'The following are multiple choice questions (with answers) about public relations. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml index 84c4fa9e..b56c6803 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml @@ -1,6 +1,5 @@ dataset_name: security_studies -description: 'The following are multiple choice questions (with answers) about security - studies. +description: 'The following are multiple choice questions (with answers) about security studies. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml index 4672df82..797fc9b7 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml @@ -1,6 +1,5 @@ dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us - foreign policy. +description: 'The following are multiple choice questions (with answers) about us foreign policy. ' diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml index e53b98c8..b0253c46 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml @@ -1,6 +1,5 @@ dataset_name: world_religions -description: 'The following are multiple choice questions (with answers) about world - religions. +description: 'The following are multiple choice questions (with answers) about world religions. 
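A note on the block of description rewrites above: the folded two-line form that every hunk deletes is simply what PyYAML emits when a scalar exceeds its default 80-column width, so the one-line form can also be produced at generation time. A minimal sketch, assuming PyYAML and its documented `width` argument to `yaml.dump`:

    import yaml

    desc = ("The following are multiple choice questions (with answers) "
            "about world religions.\n\n")
    # The default dump folds the long sentence at ~80 columns, producing the
    # two-line form these hunks delete:
    print(yaml.dump({"description": desc}))
    # A very large width keeps the sentence itself on one line, matching the
    # form these hunks add (the trailing blank line from "\n\n" still folds):
    print(yaml.dump({"description": desc}, width=float("inf")))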
' diff --git a/lm_eval/tasks/super_glue/cb/t5_utils.py b/lm_eval/tasks/super_glue/cb/t5_utils.py index 644c2111..43eafce9 100644 --- a/lm_eval/tasks/super_glue/cb/t5_utils.py +++ b/lm_eval/tasks/super_glue/cb/t5_utils.py @@ -4,7 +4,9 @@ import sklearn.metrics def mean_3class_f1(predictions, references): # This is a passthrough function string_label = ["entailment", "contradiction", "neutral"] - predictions = string_label.index(predictions[0]) if predictions[0] in string_label else 0 + predictions = ( + string_label.index(predictions[0]) if predictions[0] in string_label else 0 + ) references = string_label.index(references[0]) return (predictions, references) -- GitLab From 13940f1ecd39377a25f86973c20e47c502271d31 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 10:39:50 +0000 Subject: [PATCH 025/212] add bbh flan_cot_fewshot --- .../_flan_cot_fewshot_template_yaml | 22 +++++++++++++++++++ .../flan_cot_fewshot/boolean_expressions.yaml | 5 +++++ .../flan_cot_fewshot/causal_judgement.yaml | 5 +++++ .../flan_cot_fewshot/date_understanding.yaml | 5 +++++ .../flan_cot_fewshot/disambiguation_qa.yaml | 5 +++++ .../bbh/flan_cot_fewshot/dyck_languages.yaml | 5 +++++ .../flan_cot_fewshot/formal_fallacies.yaml | 5 +++++ .../flan_cot_fewshot/geometric_shapes.yaml | 5 +++++ .../bbh/flan_cot_fewshot/hyperbaton.yaml | 5 +++++ .../logical_deduction_five_objects.yaml | 5 +++++ .../logical_deduction_seven_objects.yaml | 5 +++++ .../logical_deduction_three_objects.yaml | 5 +++++ .../movie_recommendation.yaml | 5 +++++ .../multistep_arithmetic_two.yaml | 5 +++++ .../tasks/bbh/flan_cot_fewshot/navigate.yaml | 5 +++++ .../bbh/flan_cot_fewshot/object_counting.yaml | 5 +++++ .../flan_cot_fewshot/penguins_in_a_table.yaml | 5 +++++ .../reasoning_about_colored_objects.yaml | 5 +++++ .../bbh/flan_cot_fewshot/ruin_names.yaml | 5 +++++ .../salient_translation_error_detection.yaml | 5 +++++ .../tasks/bbh/flan_cot_fewshot/snarks.yaml | 5 +++++ .../sports_understanding.yaml | 5 +++++ .../flan_cot_fewshot/temporal_sequences.yaml | 5 +++++ ...racking_shuffled_objects_five_objects.yaml | 5 +++++ ...acking_shuffled_objects_seven_objects.yaml | 5 +++++ ...acking_shuffled_objects_three_objects.yaml | 5 +++++ .../bbh/flan_cot_fewshot/web_of_lies.yaml | 5 +++++ .../bbh/flan_cot_fewshot/word_sorting.yaml | 5 +++++ 28 files changed, 157 insertions(+) create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml create mode 100644 
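For context on the `mean_3class_f1` reformat above: the function only maps string labels to class indices and passes each (prediction, reference) pair through; the F1 itself is computed at aggregation time over all pairs. A hedged sketch of what that aggregation could look like (`agg_mean_3class_f1` is a hypothetical name, not necessarily the hook the harness registers):

    import sklearn.metrics

    def agg_mean_3class_f1(items):
        # items: iterable of (prediction_index, reference_index) pairs as
        # returned by the passthrough function above.
        predictions, references = zip(*items)
        # Macro-averaged F1 over the three classes, i.e. "mean 3-class F1".
        return sklearn.metrics.f1_score(references, predictions, average="macro")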
lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml new file mode 100644 index 00000000..d9dbf8c5 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml @@ -0,0 +1,22 @@ +group: bbh_flan_fewshot +dataset_path: lukaemon/bbh +output_type: greedy_until +test_split: test +doc_to_target: "{{target}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +generation_kwargs: + until: + - "</s>" + do_sample: false + temperature: 0.0 +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "(?<=The answer is )(.*)(?=.)" + - function: "take_first" \ No newline at end of file diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml new file mode 100644 index 00000000..849e0435 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml @@ -0,0 +1,5 @@ +"dataset_name": "boolean_expressions" +"description": "Evaluate the result of a random Boolean expression.\n\n" +"doc_to_text": "Q: not ( ( not not True ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not ( ( not not True ) ) = not ( ( A ) )\" where \"A = not not True\".\nLet's evaluate A: A = not not True = not (not True) = not False = True.\nPlugging in A, we get: Z = not ( ( A ) ) = not ( ( True ) ) = not True = False. So the answer is False.\n\nQ: True and False and not True and True is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = True and False and not True and True = A and B\" where \"A = True and False\" and \"B = not True and True\".\nLet's evaluate A: A = True and False = False.\nLet's evaluate B: B = not True and True = not (True and True) = not (True) = False.\nPlugging in A and B, we get: Z = A and B = False and False = False. So the answer is False.\n\nQ: 
not not ( not ( False ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not not ( not ( False ) ) = not not ( A )\" where \"A = not ( False )\".\nLet's evaluate A: A = not ( False ) = not False = True.\nPlugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_boolean_expressions" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml new file mode 100644 index 00000000..e7e80969 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml @@ -0,0 +1,5 @@ +"dataset_name": "causal_judgement" +"description": "Answer questions about causal attribution.\n\n" +"doc_to_text": " How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild.\" A typical person would assume that this passage suggests that Frank T. had no intention of shooting and injuring someone and that the bullet accidentally hit the neighbor's body; therefore, we conclude that Frank T. did not intentionally hit his neighbor. So the answer is No. How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that the boss ordered them both to arrive at the meeting room at the same time and that the motion detector was set up to be triggered if at least one person appeared in the room at the same time.\" A typical person would assume that the person probably meant to say the detector was set up to be triggered if \"both persons\" appeared in the room at the same time, not at least one person, since otherwise the phrase \"at the same time\" would not make much sense in that sentence. Because the motion detector went off, a typical person would therefore come to the conclusion that both Suzy and Billy triggered the motion detector to go off; hence, Billy did indeed cause the motion detector to go off. So the answer is Yes. 
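The `get-answer` filter in the `_flan_cot_fewshot_template_yaml` above extracts the final answer from a chain-of-thought completion via a lookbehind/lookahead regex. A small sketch of what that pattern captures; note it is case-sensitive, so only completions containing the literal "The answer is " match, and the lookahead leaves off the final character (typically the closing period):

    import re

    pattern = r"(?<=The answer is )(.*)(?=.)"
    completion = "Let's think step by step. 4 + -7 + 0 = -3. The answer is (D)."
    match = re.search(pattern, completion)
    print(match.group(1))  # -> "(D)"  (the trailing "." is excluded by the lookahead)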
How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"He aims the dart at the low point region.\" A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_causal_judgement" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml new file mode 100644 index 00000000..d5dc2117 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "date_understanding" +"description": "Infer the date from context.\n\n" +"doc_to_text": " Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\nOptions:\n(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\nA: Let's think step by step.\nIf today is Christmas Eve of 1937, then today's date is December 24, 1937. 10 days before today is December 14, 1937, that is 12/14/1937. So the answer is (D). Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\nOptions:\n(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n(D) 11/02/2018\n(E) 11/04/2018\nA: Let's think step by step.\nIf tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from today is 11/11/2018. So the answer is (B). Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\nA: Let's think step by step.\nIf Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. 
So the answer is (B).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_date_understanding" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml new file mode 100644 index 00000000..2d08474c --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml @@ -0,0 +1,5 @@ +"dataset_name": "disambiguation_qa" +"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" +"doc_to_text": " In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The chief told the counselor that they took the day off.\nOptions:\n(A) The chief took the day off\n(B) The counselor took the day off\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"they\" might be referring to. There are two possible referents for \"they\", namely the chief and the counselor. The verb \"told\" might be able to help us determine which one is more likely (if either). Let X be the chief and Y the counselor. The sentence is then of the form \"X told Y that (X or Y) did something.\"\nLet's consider Y first: \"X told Y that Y did something.\" This case does not make much sense, as Y would already have the information that Y did something, because it is information about themself.\nNow, consider X: \"X told Y that X did something.\" This makes sense, because X would be sharing some information about themself that Y might not have known before.\nBecause in this context, X is the chief and Y is the counselor, the answer should be the chief. So the answer is (A). In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The manager sent a message to the secretary, but he didn't reply yet.\nOptions:\n(A) The secretary didn't reply yet\n(B) The manager didn't reply yet\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"he\" might be referring to. There are two possible referents for \"he\", namely the manager and the secretary. The verbs \"sent\" and \"reply\" might be able to help us determine which one is more likely (if either). Let X be the manager and Y the secretary. The sentence is then of the form \"X sent a message to Y, but (X or Y) didn't reply yet.\"\nLet's consider Y first: \"X sent a message to Y, but Y didn't reply yet.\" This case makes sense, because of the implicit causality of the sentence. Y was the receiver of the message, but Y didn't get back to X yet.\nNow, consider X: \"X sent a message to Y, but X didn't reply yet.\" This case doesn't make sense, because X was the initial sender of the message, so it is now Y's turn to write back to X.\nBecause in this context, X is the manager and Y is the secretary, the answer should be the secretary. So the answer is (A). In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: Bailey will plan to meet the director at his office\nOptions:\n(A) It will be Bailey's office\n(B) It will be the director's office\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"his\" might be referring to. There are two possible referents for \"his\", namely Bailey's and the director's. 
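The date_understanding exemplars above walk through calendar arithmetic by hand; they can be spot-checked with Python's datetime module, for instance for the first and third demonstrations:

    from datetime import date, timedelta

    # "Today is Christmas Eve of 1937" -> the date 10 days ago:
    print(date(1937, 12, 24) - timedelta(days=10))  # 1937-12-14, option (D)
    # The 5-year anniversary of Jan 2, 1958 is Jan 2, 1963 -> tomorrow:
    print(date(1963, 1, 2) + timedelta(days=1))     # 1963-01-03, option (B)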
The verb phrase \"plan to meet\" might be able to help us determine which one is more likely (if either). Let X be Bailey and Y the director. The sentence is then of the form \"X will plan to meet Y at (X or Y)'s office.\"\nLet's consider Y first: \"X will plan to meet Y at Y's office.\" This case makes sense, because X might want to meet up with Y at Y's office.\nNow, consider X: \"X will plan to meet Y at X's office.\" This case also makes sense, because X might want to meet up with Y at X's own office.\nBecause both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_disambiguation_qa" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml new file mode 100644 index 00000000..f15bda16 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml @@ -0,0 +1,5 @@ +"dataset_name": "dyck_languages" +"description": "Correctly close a Dyck-n word.\n\n" +"doc_to_text": " Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: [ ; stack: [\n2: { ; stack: [ {\n3: [ ; stack: [ { [\nNow, we have reached the end. The final stack is \"[ { [\".\nWe will need to pop out \"[\", \"{\", \"[\" one by one in that order.\nSo, we need \"]\", \"}\", \"]\". So the answer is ] } ]. Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: < ; stack: <\n2: > ; stack: empty\n3: ( ; stack: (\n4: ( ; stack: ( (\n5: [ ; stack: ( ( [\n6: [ ; stack: ( ( [ [\n7: ( ; stack: ( ( [ [ (\n8: { ; stack: ( ( [ [ ( {\n9: } ; stack: ( ( [ [ (\n10: ) ; stack: ( ( [ [\n11: [ ; stack: ( ( [ [ [\n12: < ; stack: ( ( [ [ [ <\n13: > ; stack: ( ( [ [ [\n14: ] ; stack: ( ( [ [\n15: ] ; stack: ( ( [\nNow, we have reached the end. The final stack is \"( ( [\".\nWe will need to pop out \"[\", \"(\", \"(\" one by one in that order.\nSo, we need \"]\", \")\", \")\". So the answer is ] ) ). Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: < ; stack: <\n2: [ ; stack: < [\n3: < ; stack: < [ <\n4: [ ; stack: < [ < [\n5: { ; stack: < [ < [ {\n6: < ; stack: < [ < [ { <\n7: [ ; stack: < [ < [ { < [\n8: ] ; stack: < [ < [ { <\n9: < ; stack: < [ < [ { < <\n10: { ; stack: < [ < [ { < < {\n11: } ; stack: < [ < [ { < <\n12: > ; stack: < [ < [ { <\n13: > ; stack: < [ < [ {\n14: } ; stack: < [ < [\n15: ] ; stack: < [ <\n16: > ; stack: < [\n17: { ; stack: < [ {\n18: { ; stack: < [ { {\n19: ( ; stack: < [ { { (\n20: ) ; stack: < [ { {\n21: } ; stack: < [ {\n22: { ; stack: < [ { {\n23: < ; stack: < [ { { <\n24: [ ; stack: < [ { { < [\n25: < ; stack: < [ { { < [ <\n26: > ; stack: < [ { { < [\n27: ] ; stack: < [ { { <\n28: > ; stack: < [ { {\n29: } ; stack: < [ {\nNow, we have reached the end. 
The final stack is \"< [ {\".\nWe will need to pop out \"{\", \"[\", \"<\" one by one in that order.\nSo, we need \"}\", \"]\", \">\". So the answer is } ] >.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_dyck_languages" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml new file mode 100644 index 00000000..7e28e84e --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml @@ -0,0 +1,5 @@ +"dataset_name": "formal_fallacies" +"description": "Distinguish deductively valid arguments from formal fallacies.\n\n" +"doc_to_text": " \"It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: Let's think step by step.\n(1) Lesley is a close friend of Fernando: Lesley = friend(Fernando).\n(2) Being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy: If X = friend(Fernando) OR SCHOOLMATE(Lowell), then X = great-grandfather(Leroy).\nHypothesis: Does it follow that Lesley is a great-grandfather of Leroy: Lesley = great-grandfather(Leroy)?\nLet’s see whether the Hypothesis can be deduced from the arguments (1) and (2) by logical reasoning?\nBy (1), we have Lesley = friend(Fernando). By (2), we have if Lesley = friend(Fernando), then Lesley = great-grandfather(Leroy).\nSo, it is true that Lesley is a great-grandfather of Leroy. So the answer is valid. \"It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: Whoever is not a great-grandfather of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient for not being a great-grandfather of Clyde. We may conclude: Everyone who is an ancestor of Dana is a stepbrother of Brian, too.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: Let's think step by step.\n(1) Whoever is not a great-grandfather of Clyde is a stepbrother of Brian: If X = NOT (great-grandfather(Clyde)), then X = stepbrother(Brian).\n(2): Being an ancestor of Dana is sufficient for not being a great-grandfather of Clyde: If X = ancestor(Dana), X = NOT (great-grandfather(Clyde)).\nHypothesis: Does it follow that everyone who is an ancestor of Dana is a stepbrother of Brian, too: If X = ancestor(Dana), then X = stepbrother(Brian)?\nLet’s see whether the Hypothesis can be deduced from the arguments (1) and (2) by logical reasoning?\nBy (2), we have if X = ancestor(Dana), X = NOT (great-grandfather(Clyde)).\nFurthermore, by (1), we have if X = NOT (great-grandfather(Clyde)), then X = stepbrother(Brian).\nBy the transitive relation rule in first-order logic, we then have: if X = ancestor(Dana), then X = stepbrother(Brian).\nSo, it is true that everyone who is an ancestor of Dana is a stepbrother of Brian. So the answer is valid. \"It is not always easy to grasp who is consuming which products. 
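The dyck_languages walkthroughs above simulate a bracket stack by hand. The same push/pop bookkeeping fits in a short helper; a sketch (a hypothetical function mirroring the exemplars, not part of the harness):

    def dyck_completion(prefix: str) -> str:
        # Push opening brackets, pop on closers, then emit the closers
        # for whatever remains on the stack, innermost first.
        pairs = {"(": ")", "[": "]", "{": "}", "<": ">"}
        stack = []
        for tok in prefix.split():
            if tok in pairs:
                stack.append(tok)
            else:
                stack.pop()
        return " ".join(pairs[t] for t in reversed(stack))

    print(dyck_completion("[ { ["))                          # ] } ]
    print(dyck_completion("< > ( ( [ [ ( { } ) [ < > ] ]"))  # ] ) )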
The following argument pertains to this question: Every infrequent user of Paul Mitchell shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo and, in the same time, a loyal buyer of Caress soap. It follows that whoever is an infrequent user of Paul Mitchell shampoo is not a regular consumer of Lush soap.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: Let's think step by step.\n(1) Every infrequent user of Paul Mitchell shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, or both: If X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin) OR X = loyal-buyer(Caress).\n(2): No regular consumer of Lush soap is a rare consumer of Nioxin shampoo and a loyal buyer of Caress soap at the same time. If X = regular-consumer(Lush), then X = NOT (rare-consumer(Nioxin) AND loyal-buyer(Caress)).\nHypothesis: Does it follow that whoever is an infrequent user of Paul Mitchell shampoo is not a regular consumer of Lush soap: If X = infrequent-user(Paul Mitchell), then X = NOT (regular-consumer(Lush))?\nLet’s see whether the Hypothesis can be deduced from the arguments (1) and (2) by logical reasoning?\nBy (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin) OR X = loyal-buyer(Caress). We need to consider both cases separately:\nThe case X = rare-consumer(Nioxin) does not appear in (2).\nThe case X = loyal-buyer(Caress) does not appear in (2), either.\nSo, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_formal_fallacies" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml new file mode 100644 index 00000000..07844a53 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml @@ -0,0 +1,5 @@ +"dataset_name": "geometric_shapes" +"description": "Name geometric shapes from their SVG paths.\n\n" +"doc_to_text": " This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: Let's think step by step.\nThis SVG path element contains \"M\" and \"L\" commands. M takes two parameters (x,y) and moves the current point to the coordinates (x,y). L takes two parameters (x,y) and draws a line from the previous coordinate to the new coordinate (x,y).\nThis path can be decomposed into 9 separate commands.\n(1) M 31.00,73.00: Move the current point to 31.00,73.00.\n(2) L 32.00,59.00: Create a line from 31.00,73.00 to 32.00,59.00.\n(3) L 44.00,50.00: Create a line from 32.00,59.00 to 44.00,50.00.\n(4) L 49.00,41.00: Create a line from 44.00,50.00 to 49.00,41.00.\n(5) L 64.00,37.00: Create a line from 49.00,41.00 to 64.00,37.00.\n(6) L 71.00,55.00: Create a line from 64.00,37.00 to 71.00,55.00.\n(7) L 64.00,76.00: Create a line from 71.00,55.00 to 64.00,76.00.\n(8) L 52.00,61.00: Create a line from 64.00,76.00 to 52.00,61.00.\n(9) L 31.00,73.00: Create a line from 52.00,61.00 to 31.00,73.00.\nThis SVG path starts at point 31.00,73.00, creates eight consecutive and touching lines, and then returns back its starting point, thereby creating an eight-sided shape. It does not have any curves or arches. 
\"octagon\" is the only eight-sided object on the list. So the answer is (F). This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: Let's think step by step.\nThis SVG path element contains \"M\" and \"L\" commands. M takes two parameters (x,y) and moves the current point to the coordinates (x,y). L takes two parameters (x,y) and draws a line from the previous coordinate to the new coordinate (x,y).\nThis path can be decomposed into 6 separate commands.\n(1) M 14.19,26.04: Move the current point to 14.19,26.04.\n(2) L 51.43,39.21: Create a line from 14.19,26.04 to 51.43,39.21.\n(3) L 58.44,36.69: Create a line from 51.43,39.21 to 58.44,36.69.\n(4) L 56.63,30.17: Create a line from 58.44,36.69 to 56.63,30.17.\n(5) L 48.53,26.66: Create a line from 56.63,30.17 to 48.53,26.66.\n(6) L 14.19,26.04: Create a line from 48.53,26.66 to 14.19,26.04.\nThis SVG path starts at point 14.19,26.04, creates five consecutive and touching lines, and then returns back its starting point, thereby creating a five-sided shape. It does not have any curves or arches. \"pentagon\" is the only five-sided polygon on the list. So the answer is (G). This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: Let's think step by step.\nThis SVG path element contains \"M\" and \"L\" commands. M takes two parameters (x,y) and moves the current point to the coordinates (x,y). L takes two parameters (x,y) and draws a line from the previous coordinate to the new coordinate (x,y).\nThis path can be decomposed into 5 separate commands.\n(1) M 41.00,43.00: Move the current point to 41.00,43.00.\n(2) L 37.00,34.00: Create a line from 41.00,43.00 to 37.00,34.00.\n(3) L 41.00,33.00: Create a line from 37.00,34.00 to 41.00,33.00.\n(4) L 45.00,34.00: Create a line from 41.00,33.00 to 45.00,34.00.\n(5) L 41.00,43.00: Create a line from 45.00,34.00 to 41.00,43.00.\nThis SVG path starts at point 41.00,43.00, creates four consecutive and touching lines, and then returns back its starting point, thereby creating a four-sided shape. \"kite\" and \"rectangle\" are the only two four-sided polygons on the list. So, we need to determine which one is the correct answer.\nA kite has two pairs of equal-length adjacent sides, whereas a rectangle has two pairs of equal-length alternate (opposite) sides. Now, let's check whether the two adjacent sides of this shape are equal.\nLength of side A: |A| = sqrt((41.00-37.00)^2 + (43.00-34.00)^2) = sqrt((4)^2 + (9)^2) = sqrt(16 + 81) = sqrt(97).\nLength of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 + (1)^2) = sqrt(16 + 1) = sqrt(17).\nLength of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).\nLength of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).\nNote that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is \"kite\". 
So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_geometric_shapes" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml new file mode 100644 index 00000000..2dc997cc --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml @@ -0,0 +1,5 @@ +"dataset_name": "hyperbaton" +"description": "Order adjectives correctly in English sentences.\n\n" +"doc_to_text": " Which sentence has the correct adjective order:\nOptions:\n(A) rubber terrible ship\n(B) terrible rubber ship\nA: Let's think step by step.\nWhen there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: \"[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun\".\nOption (A): \"rubber terrible ship\". (1) rubber\" falls into the material category. (2) \"terrible\" falls into the opinion category. Option (A) has the following adjective order: [7. material] [1. opinion] (or, in numeric terms, 7 1). Because 7 < 1 is not correct, (A) does not have the correct ordering.\nOption (B): \"terrible rubber ship\". Option (B) has the following adjective order: [1. opinion] [7. material] (or, in numeric terms, 1 7). Because 1 < 7 is correct, (B) has the correct ordering. So the answer is (B). Which sentence has the correct adjective order:\nOptions:\n(A) repulsive small Brazilian exercise ship\n(B) Brazilian repulsive exercise small ship\nA: Let's think step by step.\nWhen there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: \"[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun\".\nOption (A): \"repulsive small Brazilian exercise ship\". (1) \"repulsive\" falls into the opinion category. (2) \"small\" falls into the size category. (3) \"Brazilian\" falls into the origin category. (4) \"exercise\" falls into the purpose category. Option (A) has the following adjective order: [1. opinion] [2. size] [6. origin] [8. purpose] (or, in numeric terms, 1 2 6 8). Because 1 < 2 < 6 < 8 is correct, (A) has the correct ordering.\nOption (B): \"Brazilian repulsive exercise small ship\". Option (B) has the following adjective order: [6. origin] [1. opinion] [8. purpose] [2. size] (or, in numeric terms, 6 1 8 2). Because 6 < 1 < 8 < 2 is not correct, (B) does not have the correct ordering. So the answer is (A). Which sentence has the correct adjective order:\nOptions:\n(A) blue gold wonderful square shoe\n(B) wonderful square blue gold shoe\nA: Let's think step by step.\nWhen there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: \"[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun\".\nOption (A): \"blue gold wonderful square shoe\". (1) \"blue\" falls into the color category. (2) \"gold\" falls into the material category. (3) \"wonderful\" falls into the opinion category. (4) \"square\" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.\nOption (B): \"wonderful square blue gold shoe\". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). 
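The side-length comparison at the end of the geometric_shapes kite exemplar above can be reproduced directly; a sketch using that exemplar's four vertices:

    import math

    pts = [(41.0, 43.0), (37.0, 34.0), (41.0, 33.0), (45.0, 34.0)]
    # Consecutive side lengths, wrapping back to the starting point:
    sides = [math.hypot(x2 - x1, y2 - y1)
             for (x1, y1), (x2, y2) in zip(pts, pts[1:] + pts[:1])]
    print([round(s, 2) for s in sides])  # [9.85, 4.12, 4.12, 9.85]
    # Two pairs of equal-length adjacent sides -> a kite, option (D).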
Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_hyperbaton" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml new file mode 100644 index 00000000..b54bac3b --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_five_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": " The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: Let's think step by step.\n(1) Eve finished above Amy: \"(above) ? Eve ? Amy ? (below)\".\n(2) Eli finished below Amy: \"(above) ? Amy ? Eli ? (below)\".\n(3) Combining (1) and (2) we get the following ordering: \"(above) Eve Amy Eli (below)\".\nAccording to this ordering, the person who finished last (the one at the bottom of this list) is Eli.\nEli finished last. So the answer is (B). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: Let's think step by step.\n(1) The green book is to the right of the white book: \"(left) ? white ? green ? (right)\".\n(2) The orange book is the rightmost: \"(left) ? white ? green orange (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white green orange (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (A). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: Let's think step by step.\n(1) The white book is to the left of the gray book: \"(left) ? white ? gray ? (right)\".\n(2) The red book is the second from the left: \"(left) ? white red gray ? (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white red gray (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. 
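The adjective-order rule used in the hyperbaton exemplars above is a fixed ranking of categories, so checking an ordering reduces to testing that the ranks are non-decreasing. A sketch:

    ORDER = ["opinion", "size", "age", "shape", "color",
             "origin", "material", "purpose"]

    def is_correct_order(categories):
        ranks = [ORDER.index(c) for c in categories]
        return ranks == sorted(ranks)

    print(is_correct_order(["opinion", "material"]))  # True  ("terrible rubber ship")
    print(is_correct_order(["material", "opinion"]))  # False ("rubber terrible ship")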
So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_logical_deduction_five_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml new file mode 100644 index 00000000..f4799ae6 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_seven_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": " The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: Let's think step by step.\n(1) Eve finished above Amy: \"(above) ? Eve ? Amy ? (below)\".\n(2) Eli finished below Amy: \"(above) ? Amy ? Eli ? (below)\".\n(3) Combining (1) and (2) we get the following ordering: \"(above) Eve Amy Eli (below)\".\nAccording to this ordering, the person who finished last (the one at the bottom of this list) is Eli.\nEli finished last. So the answer is (B). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: Let's think step by step.\n(1) The green book is to the right of the white book: \"(left) ? white ? green ? (right)\".\n(2) The orange book is the rightmost: \"(left) ? white ? green orange (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white green orange (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (A). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: Let's think step by step.\n(1) The white book is to the left of the gray book: \"(left) ? white ? gray ? (right)\".\n(2) The red book is the second from the left: \"(left) ? white red gray ? (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white red gray (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. 
So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_logical_deduction_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml new file mode 100644 index 00000000..cbc0dbd1 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_three_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": " The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: Let's think step by step.\n(1) Eve finished above Amy: \"(above) ? Eve ? Amy ? (below)\".\n(2) Eli finished below Amy: \"(above) ? Amy ? Eli ? (below)\".\n(3) Combining (1) and (2) we get the following ordering: \"(above) Eve Amy Eli (below)\".\nAccording to this ordering, the person who finished last (the one at the bottom of this list) is Eli.\nEli finished last. So the answer is (B). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: Let's think step by step.\n(1) The green book is to the right of the white book: \"(left) ? white ? green ? (right)\".\n(2) The orange book is the rightmost: \"(left) ? white ? green orange (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white green orange (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (A). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: Let's think step by step.\n(1) The white book is to the left of the gray book: \"(left) ? white ? gray ? (right)\".\n(2) The red book is the second from the left: \"(left) ? white red gray ? (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white red gray (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. 
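The logical_deduction exemplars above (the five-, seven-, and three-object variants share the same three demonstrations) deduce a total order from pairwise constraints. For these small sizes the deduction can be brute-forced; a sketch using the golf demonstration:

    from itertools import permutations

    def solve(names, above_pairs):
        # above_pairs: (x, y) meaning x finished above / sits left of y.
        for order in permutations(names):
            if all(order.index(x) < order.index(y) for x, y in above_pairs):
                return order

    print(solve(["Amy", "Eli", "Eve"], [("Eve", "Amy"), ("Amy", "Eli")]))
    # ('Eve', 'Amy', 'Eli') -> Eli finished last, option (B)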
So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_logical_deduction_three_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml new file mode 100644 index 00000000..573b3222 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml @@ -0,0 +1,5 @@ +"dataset_name": "movie_recommendation" +"description": "Recommend movies similar to the given list of movies.\n\n" +"doc_to_text": " Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:\nOptions:\n(A) Tetsuo\n(B) the Ironman\n(C) The Princess Bride\n(D) The Barkley Marathons The Race That Eats Its Young\n(E) Bug\nA: Let's think step by step.\n- Star Wars Episode IV - A New Hope (action, adventure, fantasy; 1977)\n- Indiana Jones and the Last Crusade (action, adventure; 1989)\n- Star Wars Episode V - The Empire Strikes Back (action, adventure, fantasy; 1980)\n- The Big Lebowski (action, drama, comedy; 1998)\nThese are all famous classic American movies produced before 2000. Amongst all the options, the only movie similar to these ones seems to be The Princess Bride (1987). So the answer is (C). Find a movie similar to Twister, The Silence of the Lambs, Independence Day, Braveheart:\nOptions:\n(A) They Shoot Horses\n(B) Don't They\n(C) Forrest Gump\n(D) The Salton Sea\n(E) Extreme Days\nA: Let's think step by step.\n- Twister (action, adventure, thriller; 1996)\n- The Silence of the Lambs (crime, drama, thriller; 1991)\n- Independence Day (action, science-fiction, drama; 1996)\n- Braveheart (biography, drama, epic; 1995)\nThese are all famous Hollywood movies produced around the 1990s. Amongst all the options, the only movie similar to these ones seems to be Forrest Gump (comedy, drama, romance; 1994). So the answer is (C). Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest Gump:\nOptions:\n(A) Phenomena\n(B) Lilting\n(C) Catwoman\n(D) Edge of Tomorrow\nA: Let's think step by step.\n- Minority Report (action, crime, mystery; 2002)\n- Total Recall (action, adventure, science-fiction; 2012)\n- Inside Out (animation, family, comedy; 2015)\n- Forrest Gump (comedy, drama, romance; 1994)\nThese are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_movie_recommendation" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml new file mode 100644 index 00000000..21f34e7a --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml @@ -0,0 +1,5 @@ +"dataset_name": "multistep_arithmetic_two" +"description": "Solve multi-step arithmetic problems.\n\n" +"doc_to_text": " ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =\nA: Let's think step by step.\nLet’s recall that the order of operations in mathematics is as follows: (1) Parentheses, (2) exponents, (3) multiplication and division (from left to right), (4) addition and multiplication (from left to right). 
So, remember to always compute the expressions inside parentheses or brackets first.\nThis equation can be written as \"A * B\", where A = (-5 + 9 * -4 - 0) and B = (4 + -7 + 0 * -5).\nLet's calculate A = (-5 + 9 * -4 - 0) = (-5 + (9 * -4) - 0) = (-5 + (-36) - 0) = (-5 + -36 - 0) = -5 - 36 = -41.\nLet's calculate B = (4 + -7 + 0 * -5) = (4 + -7 + (0 * -5)) = (4 + -7 + 0) = (4 + -7) = (4 - 7) = -3.\nThen, the final equation is A * B = -41 * -3 = (-61) * (-3) = 123. So the answer is 123. ((-9 * 7 * 7 * -9) + (4 * -9 - 8 - -4)) =\nA: Let's think step by step.\nLet’s recall that the order of operations in mathematics is as follows: (1) Parentheses, (2) exponents, (3) multiplication and division (from left to right), (4) addition and multiplication (from left to right). So, remember to always compute the expressions inside parentheses or brackets first.\nThis equation can be written as \"A + B\", where A = (-9 * 7 * 7 * -9) and B = (4 * -9 - 8 - -4).\nLet's calculate A = (-9 * 7 * 7 * -9) = ((-9 * 7) * (7 * -9)) = ((-63) * (-63)) = 3969.\nLet's calculate B = (4 * -9 - 8 - (-4)) = ((4 * -9) - 8 - (-4)) = ((-36) - 8 - (-4)) = ((-36 - 8) - (-4)) = (-44 - (-4)) = -40.\nThen, the final equation is A + B = 3969 + -40 = 3969 - 40 = 3929. So the answer is 3929. ((-3 + 5 * 8 * -4) - (9 - 8 * -7 + -9)) =\nA: Let's think step by step.\nLet’s recall that the order of operations in mathematics is as follows: (1) Parentheses, (2) exponents, (3) multiplication and division (from left to right), (4) addition and multiplication (from left to right). So, remember to always compute the expressions inside parentheses or brackets first.\nThis equation can be written as \"A - B\", where A = (-3 + 5 * 8 * -4) and B = (9 - 8 * -7 + -9).\nLet's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.\nLet's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.\nThen, the final equation is A - B = -163 - 56 = -219. So the answer is -219.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_multistep_arithmetic_two" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml new file mode 100644 index 00000000..be3cd234 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml @@ -0,0 +1,5 @@ +"dataset_name": "navigate" +"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" +"doc_to_text": " If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nWe start at the origin (0, 0), facing the positive y-axis.\n(1) Turn left: (0, 0), facing the negative x-axis.\n(2) Turn around: (0, 0), facing the positive x-axis.\n(3) Turn left: (0, 0), facing the positive y-axis.\n(4) Take 7 steps: (0, 7), facing the positive y-axis.\n(5) Take 2 steps: (0, 9), facing the positive y-axis.\n(6) Take 4 steps: (0, 13), facing the positive y-axis.\n(7) Take 8 steps: (0, 21), facing the positive y-axis.\nSince (0, 21) is not (0, 0), we are not where we started. So the answer is No. If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. 
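The three multistep_arithmetic_two targets above follow standard operator precedence, so they can be verified by evaluating the expressions directly in Python:

    # Direct evaluation of the three exemplar expressions:
    print((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5))  # 123
    print((-9 * 7 * 7 * -9) + (4 * -9 - 8 - -4))  # 3929
    print((-3 + 5 * 8 * -4) - (9 - 8 * -7 + -9))  # -219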
Take 1 step.\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nWe start at the origin (0, 0), facing the positive y-axis.\n(1) Turn around: (0, 0), facing the negative y-axis.\n(2) Take 1 step: (0, -1), facing the negative y-axis.\n(3) Take 6 steps: (0, -7), facing the negative y-axis.\n(4) Turn around: (0, -7), facing the positive y-axis.\n(5) Take 6 steps: (0, -1), facing the positive y-axis.\n(6) Take 9 steps: (0, 8), facing the positive y-axis.\n(7) Take 1 step: (0, 9), facing the positive y-axis.\nSince (0, 9) is not (0, 0), we are not where we started. So the answer is No. If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nWe start at the origin (0, 0), facing the positive y-axis.\n(1) Always face forward: (0, 0), facing the positive y-axis.\n(2) Take 2 steps right: (0, 2), facing the positive y-axis.\n(3) Take 9 steps left: (0, -7), facing the positive y-axis.\n(4) Take 7 steps right: (0, 0), facing the positive y-axis.\nSince (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml new file mode 100644 index 00000000..767d414f --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml @@ -0,0 +1,5 @@ +"dataset_name": "object_counting" +"description": "Questions that involve enumerating objects and asking the model to count them.\n\n" +"doc_to_text": " I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?\nA: Let's think step by step.\nWe first identify the fruits on the list and include their quantity in parentheses:\n- blackberry (1)\n- nectarine (1)\n- plum (1)\n- strawberry (1)\n- banana (1)\n- orange (1)\nNow, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 1 + 1 = 6. So the answer is 6. I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape, a nectarine, and three plums. How many fruits do I have?\nA: Let's think step by step.\nWe first identify the fruits on the list and include their quantity in parentheses:\n- orange (1)\n- raspberry (1)\n- peaches (2)\n- blackberry (1)\n- apple (1)\n- grape (1)\n- nectarine (1)\n- plums (3)\nNow, let's add the numbers in parentheses: 1 + 1 + 2 + 1 + 1 + 1 + 1 + 3 = 11. So the answer is 11. I have a lettuce head, a head of broccoli, an onion, a stalk of celery, two carrots, a garlic, and a yam. How many vegetables do I have?\nA: Let's think step by step.\nWe first identify the vegetables on the list and include their quantity in parentheses:\n- lettuce (1)\n- broccoli (1)\n- onion (1)\n- celery (1)\n- carrots (2)\n- garlic (1)\n- yam (1)\nNow, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8.
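The navigate exemplars above all reduce to tracking a position and a heading on a grid. A minimal sketch of that bookkeeping follows; it is illustrative only, not part of the patch, and the function name and move encoding are invented for this note. Note the exemplars log lateral steps on a single axis, while this sketch strafes in true 2-D; the Yes/No outcome is the same either way.

def returns_to_start(moves):
    """Turns are "left"/"right"/"around"; steps are (n, lateral) tuples,
    with lateral None for forward or "left"/"right" for strafing."""
    x = y = 0
    dx, dy = 0, 1  # start at the origin, facing the positive y-axis
    for move in moves:
        if move == "left":
            dx, dy = -dy, dx    # rotate heading 90 degrees counter-clockwise
        elif move == "right":
            dx, dy = dy, -dx    # rotate heading 90 degrees clockwise
        elif move == "around":
            dx, dy = -dx, -dy
        else:
            n, lateral = move
            sx, sy = {None: (dx, dy), "right": (dy, -dx), "left": (-dy, dx)}[lateral]
            x, y = x + n * sx, y + n * sy
    return (x, y) == (0, 0)

# First exemplar: three turns, then 7 + 2 + 4 + 8 forward steps -> 21 away.
assert not returns_to_start(["left", "around", "left", (7, None), (2, None), (4, None), (8, None)])
# Third exemplar: 2 right, 9 left, 7 right -> net zero displacement.
assert returns_to_start([(2, "right"), (9, "left"), (7, "right")])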
So the answer is 8.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml new file mode 100644 index 00000000..439c5f45 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml @@ -0,0 +1,5 @@ +"dataset_name": "penguins_in_a_table" +"description": "Answer questions about a table of penguins and their attributes.\n\n" +"doc_to_text": " Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:\nJames, 12, 90, 12\nHow many penguins are less than 8 years old?\nOptions:\n(A) 1\n(B) 2\n(C) 3\n(D) 4\n(E) 5\nA: Let's think step by step.\nThis question focuses on age. We know the following: Louis is 7 years old, Bernard is 5 years old, Vincent is 9 years old, and Gwen is 8 years old.\nNow, we add James to this table: James is 12 years old.\nThe penguins that are less than 8 years old are Louis and Bernard.\nThere are 2 penguins less than 8 years old. So the answer is (B). Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. Which is the youngest penguin?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: Let's think step by step.\nThis question focuses on age. We know the following: Louis is 7 years old, Bernard is 5 years old, Vincent is 9 years old, and Gwen is 8 years old.\nAccording to the table, Bernard (5) is the youngest amongst them.\nThe youngest penguin is Bernard. So the answer is (B). Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. What is the name of the second penguin sorted by alphabetic order?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: Let's think step by step.\nThis question focuses on the name. We know the following: The names of the penguin in the table are Louis, Bernard, Vincent, and Gwen.\nWhen we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.\nThe name of the second penguin sorted by alphabetical order is Gwen.\nThe name of the second penguin sorted by alphabetic order is Gwen. 
So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..a93951da --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "reasoning_about_colored_objects" +"description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" +"doc_to_text": " On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: Let's think step by step.\nAccording to this question, the color of the stress ball is blue. So the answer is (E). On the table, you see a bunch of objects arranged in a row: a purple paperclip, a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve fidget spinner, and a burgundy pen. What is the color of the object directly to the right of the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: Let's think step by step.\nAccording to this question, the objects are arranged in a row, from left to right, as follows: (1) a purple paperclip, (2) a pink stress ball, (3) a brown keychain, (4) a green scrunchiephone charger, (5) a mauve fidget spinner, (6) a burgundy pen.\nThe stress ball is the second object on the list, namely (2). The object that is to the right of the stress ball corresponds to (3), which is a brown keychain.\nThe color of the keychain is brown. So the answer is (F). On the nightstand, you see the following items arranged in a row: a teal plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, a pink notebook, and a grey cup. How many non-orange items do you see to the left of the teal item?\nOptions:\n(A) zero\n(B) one\n(C) two\n(D) three\n(E) four\n(F) five\n(G) six\nA: Let's think step by step.\nAccording to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.\nThe teal plate is the first item, namely (1). There is no item to the left of the teal item.\nThe number of non-orange items to the left of the teal item is zero. 
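The penguins_in_a_table questions above amount to simple filters over a tiny parsed table. A rough illustration, not part of the patch; the tuples simply mirror the table given in the prompt:

penguins = [("Louis", 7, 50, 11), ("Bernard", 5, 80, 13),
            ("Vincent", 9, 60, 11), ("Gwen", 8, 70, 15)]
with_james = penguins + [("James", 12, 90, 12)]  # the first question adds a row
assert sum(age < 8 for _, age, _, _ in with_james) == 2       # 2 penguins -> (B)
assert min(penguins, key=lambda row: row[1])[0] == "Bernard"  # youngest -> (B)
assert sorted(name for name, *_ in penguins)[1] == "Gwen"     # 2nd alphabetically -> (D)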
So the answer is (A).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml new file mode 100644 index 00000000..d6a3485b --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml @@ -0,0 +1,5 @@ +"dataset_name": "ruin_names" +"description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" +"doc_to_text": " Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?\nOptions:\n(A) whitesnape\n(B) whitesnapke\n(C) whitesnuake\n(D) mwhitesnake\nA: Let's think step by step.\nThe original name is \"whitesnake\". This is the name of an old English hard rock band. It is a compound word, formed by the words \"white\" and \"snake\".\n(A) \"whitesnape\": It is formed by the combination of \"white\" and \"snake\"; therefore, \"snake\" has been changed to \"snape\". Snape makes a reference to the fictional character Severus Snape in the Harry Potter series, so (A) is indeed a meaningful and funny edit.\n(B) \"whitesnapke\": It is formed by the combination of \"white\" and \"snapke\", but \"snapke\" is not an actual word; therefore, \"whitesnapke\" is not humorous.\n(C) \"whitesnuake\": It is formed by the combination of \"white\" and \"snuake\", but \"snuake\" is not an actual word; therefore, \"whitesnuake\" is not humorous.\n(D) \"mwhitesnake\": It is formed by the combination of \"m\", \"white\", and \"snake\", but the prefix \"-m\" seems arbitrary; therefore, \"mwhitesnake\" is not meaningful or humorous.\nAbove the above, the only humorous edit is (A). So the answer is (A). Which of the following is a humorous edit of this artist or movie name: 'one of our dinosaurs is missing'?\nOptions:\n(A) ofne of our dinosaurs is missing\n(B) one af our dinosaurs is missing\n(C) one of our dinosaurs is pissing\n(D) one of our dinosaur is missing\nA: Let's think step by step.\nThe original name is \"one of our dinosaurs is missing\". This is the name of an old British movie.\n(A) \"ofne of our dinosaurs is missing\": Here \"one of\" is changed to \"ofne\", but the word \"ofne\" is not an actual word.\n(B) \"one af our dinosaurs is missing\": Here the word \"of\" is changed to \"af\", but the word \"af\" is not an actual word.\n(C) \"one of our dinosaurs is pissing\": Here the word \"missing\" is changed to \"pissing\", and \"one of our dinosaurs is pissing\" is indeed a very whimsical and mischievous edit. This change truly ruins the original title of the movie.\n(D) \"one of our dinosaur is missing\": Here the word \"dinosaurs\" is changed to \"dinosaur\", but \"dinosaur\" is singular but should be plural in the title; this change therefore feels arbitrary and not humorous.\nAbove the above, the only humorous edit is (C). So the answer is (C). Which of the following is a humorous edit of this artist or movie name: 'counting crows'?\nOptions:\n(A) countingy crows\n(B) counting cows\n(C) courting crows\n(D) coutnting crows\nA: Let's think step by step.\nThe original name is \"counting crows\". This is the name of an American rock band.
Historically, the band name comes from the British nursery rhyme \"One for Sorrow\", which is about counting of magpies.\n(A) \"countingy crows\": Here the word \"counting\" is changed to \"countingy\", but the word \"countingy\" is not an actual word.\n(B) \"counting cows\": Here the word \"crows\" is changed to \"cows\", and this is indeed a playful and meaningful edit that ruins the original name of the band.\n(C) \"courting crows\": Here the word \"counting\" is changed to \"courting\", and \"courting\" is an actual word; however, \"courting crows\" does not sound as humorous as \"counting cows\".\n(D) \"coutnting crows\": Here the word \"counting\" is changed to \"coutnting\", but the word \"coutnting\" is not an actual word.\nAbove the above, the only humorous edit is (B). So the answer is (B).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml new file mode 100644 index 00000000..2aa42072 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml @@ -0,0 +1,5 @@ +"dataset_name": "salient_translation_error_detection" +"description": "Detect the type of error in an English translation of a German source sentence.\n\n" +"doc_to_text": " The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.\nTranslation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: Let's think step by step.\nWe solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is \"The list of monuments in Lenzen (Elbe) includes all the monuments in the Brandenburg town of Lenzen (Elbe) and its districts.\" On the other hand, the provided translation is \"In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.\" Note that Lenzen (Elbe) is changed to Lenzen in the original translation; so, there is a named entity error. Because an entity in the original source sentence is changed to a different entity in the translation, the translation contains an error pertaining to Named Entities. So the answer is (D). 
The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: Auf dieser Seite sind die Baudenkmäler der oberbayerischen Großen Kreisstadt Landsberg am Lech zusammengestellt.\nTranslation: On this page are compiled the architectural monuments of the town of Landsberg am Lech.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: Let's think step by step.\nWe solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is \"The monuments of the Upper Bavarian district town of Landsberg am Lech are compiled on this page.\" On the other hand, the provided translation is \"On this page are compiled the architectural monuments of the town of Landsberg am Lech.\" Note that an important detail about the location of Landsberg am Lech is omitted in the original translation: The translation should have said \"Upper Bavarian district town of Landsberg am Lech\". Because a significant clause in the translation was removed, the translation contains an error pertaining to Dropped Content. So the answer is (E). The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: Łeba ist eine Kleinstadt und ein Badeort im Powiat Lęborski der polnischen Woiwodschaft Pommern.\nTranslation: Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: Let's think step by step.\nWe solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation.
According to Google Translate, the correct translation of the source sentence from German to English is \"Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship.\" On the other hand, the provided translation is \"Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland.\" Note that the provided sentence says, \"Łeba is not a small town ...\" However, the translation should have been \"Łeba is a small town ...\" Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_salient_translation_error_detection" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml new file mode 100644 index 00000000..0138faa7 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml @@ -0,0 +1,5 @@ +"dataset_name": "snarks" +"description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n" +"doc_to_text": " Which statement is sarcastic?\nOptions:\n(A) Yes, because having interests and actively researching them is a huge waste\n(B) Yes, because having interests and actively researching them is a huge deal\nA: Let's think step by step.\nIf we look at (A), it says that having interests and actively researching them is a huge waste, implying that it is a useless effort. However, we know that having interests and actively researching them is typically not a waste but rather is beneficial to the individual. The presence of such a juxtaposition in (A) suggests that it contains a taste of irony and sarcasm.\nIf we look at (B), it says that having interests and actively researching them is a huge deal, implying that it is an important and consequential effort. This is arguably a neutral and correct statement.\nAbove the above, the sarcastic option is (A). So the answer is (A). Which statement is sarcastic?\nOptions:\n(A) No one is going to disagree with you on this. Avoiding ad hominem attacks really help your case\n(B) No one is going to disagree with you on this. Ad hominem attacks really help your case\nA: Let's think step by step.\nIf we look at (A), it says that avoiding ad hominem attacks really help your case, implying that ad hominem attacks are adverse and injurious. Because ad hominem attacks are addressed at a person rather than an idea, it is indeed true that avoiding them is often useful and helpful; so, (A) is a neutral (valid and agreeable) statement.\nIf we look at (B), it says that ad hominem attacks really help your case, implying that ad hominem attacks are a positive thing. However, we stated previously that ad hominem attacks are often not useful or constructive. The speaker in this sentence therefore seems to mean the opposite of what they are saying; so, there appears to be a taste of irony and sarcasm in (B).\nAbove the above, the sarcastic option is (B). So the answer is (B).
Which statement is sarcastic?\nOptions:\n(A) Consistency in the league's punishments? What do you think this is supposed to be, politics?\n(B) Consistency in the league's punishments? What do you think this is supposed to be, moral?\nA: Let's think step by step.\nIf we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.\nIf we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.\nAbove the above, the sarcastic option is (A). So the answer is (A).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_snarks" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml new file mode 100644 index 00000000..90c0f191 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "sports_understanding" +"description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" +"doc_to_text": " Is the following sentence plausible? \"Bam Adebayo scored a reverse layup in the Western Conference Finals.\"\nA: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes. Is the following sentence plausible? \"Santi Cazorla scored a touchdown.\"\nA: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part of American football and rugby. So the answer is no. Is the following sentence plausible? \"DeMar DeRozan was called for the goal tend.\"\nA: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_sports_understanding" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml new file mode 100644 index 00000000..cff56746 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml @@ -0,0 +1,5 @@ +"dataset_name": "temporal_sequences" +"description": "Task description: Answer questions about which times certain events could have occurred.\n\n" +"doc_to_text": " Today, Emily went to the museum. Between what times could they have gone?\nWe know that:\nEmily woke up at 1pm.\nElizabeth saw Emily reading at the library from 2pm to 4pm.\nJessica saw Emily watching a movie at the theater from 4pm to 5pm.\nLeslie saw Emily waiting at the airport from 5pm to 6pm.\nWilliam saw Emily buying clothes at the mall from 6pm to 7pm.\nThe museum was closed after 7pm.\nBetween what times could Emily have gone to the museum?\nOptions:\n(A) 1pm to 2pm\n(B) 6pm to 7pm\n(C) 5pm to 6pm\n(D) 2pm to 4pm\nA: Let's think step by step.\nWake-up time: 1pm.\n1pm-2pm: free.\n2pm-4pm: reading at the library.\n4pm-5pm: watching a movie at the theater.\n5pm-6pm: waiting at the airport.\n6pm-7pm: buying clothes at the mall.\nThe museum closure time: 7pm.\nThe only time when Emily could have gone to the museum was 1pm to 2pm. 
So the answer is (A). Today, Elizabeth went to the amusement park. Between what times could they have gone?\nWe know that:\nElizabeth woke up at 7am.\nDavid saw Elizabeth fixing their computer at the electronic store from 1pm to 2pm.\nSarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm.\nSusan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm.\nAndrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm.\nEmily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm.\nThe amusement park was closed after 10pm.\nBetween what times could Elizabeth have gone to the amusement park?\nOptions:\n(A) 7am to 1pm\n(B) 9pm to 10pm\n(C) 1pm to 2pm\n(D) 3pm to 6pm\nA: Let's think step by step.\nWake-up time: 7am.\n7am-1pm: free.\n1pm-2pm: fixing their computer at the electronic store.\n2pm-3pm: playing tennis at the tennis court.\n3pm-6pm: walking towards the Statue of Liberty.\n6pm-9pm: taking photos near the Eiffel Tower.\n9pm-10pm: getting a coffee at the cafe.\nThe amusement park closure time: 10pm.\nThe only time when Elizabeth could have gone to the amusement park was 7am to 1pm. So the answer is (A). Today, Tiffany went to the beach. Between what times could they have gone?\nWe know that:\nTiffany woke up at 5am.\nBetty saw Tiffany getting a coffee at the cafe from 5am to 6am.\nJessica saw Tiffany working at the office from 6am to 9am.\nJohn saw Tiffany stretching at a yoga studio from 9am to 12pm.\nSean saw Tiffany sitting on a rooftop from 12pm to 2pm.\nSarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm.\nThe beach was closed after 4pm.\nBetween what times could Tiffany have gone to the beach?\nOptions:\n(A) 9am to 12pm\n(B) 12pm to 2pm\n(C) 5am to 6am\n(D) 3pm to 4pm\nA: Let's think step by step.\nWake-up time: 5am.\n5am-6am: getting a coffee at the cafe.\n6am-9am: working at the office.\n9am-12pm: stretching at a yoga studio.\n12pm-2pm: sitting on a rooftop.\n2pm-3pm: playing tennis at the tennis court.\n3pm-4pm: free.\nThe beach closure time: 4pm.\nThe only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_temporal_sequences" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 00000000..a4fd3995 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_five_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": " Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. 
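The temporal_sequences exemplars above all follow one pattern: find the interval between waking and closing time that no sighting covers. A small illustrative helper, not part of the patch; the function name and the 24-hour encoding are invented for this note.

def free_slots(wake, close, busy):
    """Return the (start, end) gaps between `wake` and `close` not covered
    by any interval in `busy` (all hours on a 24-hour clock)."""
    gaps, t = [], wake
    for start, end in sorted(busy):
        if t < start:
            gaps.append((t, start))
        t = max(t, end)
    if t < close:
        gaps.append((t, close))
    return gaps

# Third exemplar (Tiffany): woke at 5am, beach closed after 4pm,
# sightings cover 5-6, 6-9, 9-12, 12-14, and 14-15.
assert free_slots(5, 16, [(5, 6), (6, 9), (9, 12), (12, 14), (14, 15)]) == [(15, 16)]  # 3pm-4pm -> (D)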
At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: yellow, Bob: blue, Claire: pink.\n(1) Claire and Alice swap balls: Alice: pink, Bob: blue, Claire: yellow.\n(2) Alice and Bob swap balls: Alice: blue, Bob: pink, Claire: yellow.\n(3) Claire and Bob swap balls: Alice: blue, Bob: yellow, Claire: pink.\nAt the end of the game, Bob has the yellow ball. So the answer is (A). Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: white, Bob: purple, Claire: pink.\n(1) Bob and Alice swap balls: Alice: purple, Bob: white, Claire: pink.\n(2) Bob and Claire swap balls: Alice: purple, Bob: pink, Claire: white.\n(3) Bob and Alice swap balls: Alice: pink, Bob: purple, Claire: white.\nAt the end of the game, Alice has the pink ball. So the answer is (C). Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: Let's think step by step.\n(0) At the start: Alice: Lola, Bob: Rodrigo, Claire: Patrick.\n(1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.\n(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.\n(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.\nAt the end of the dance, Alice is dancing with Patrick. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_tracking_shuffled_objects_five_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 00000000..28ff5389 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_seven_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": " Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. 
At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: yellow, Bob: blue, Claire: pink.\n(1) Claire and Alice swap balls: Alice: pink, Bob: blue, Claire: yellow.\n(2) Alice and Bob swap balls: Alice: blue, Bob: pink, Claire: yellow.\n(3) Claire and Bob swap balls: Alice: blue, Bob: yellow, Claire: pink.\nAt the end of the game, Bob has the yellow ball. So the answer is (A). Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: white, Bob: purple, Claire: pink.\n(1) Bob and Alice swap balls: Alice: purple, Bob: white, Claire: pink.\n(2) Bob and Claire swap balls: Alice: purple, Bob: pink, Claire: white.\n(3) Bob and Alice swap balls: Alice: pink, Bob: purple, Claire: white.\nAt the end of the game, Alice has the pink ball. So the answer is (C). Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: Let's think step by step.\n(0) At the start: Alice: Lola, Bob: Rodrigo, Claire: Patrick.\n(1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.\n(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.\n(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.\nAt the end of the dance, Alice is dancing with Patrick. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_tracking_shuffled_objects_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 00000000..84526656 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_three_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": " Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. 
At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: yellow, Bob: blue, Claire: pink.\n(1) Claire and Alice swap balls: Alice: pink, Bob: blue, Claire: yellow.\n(2) Alice and Bob swap balls: Alice: blue, Bob: pink, Claire: yellow.\n(3) Claire and Bob swap balls: Alice: blue, Bob: yellow, Claire: pink.\nAt the end of the game, Bob has the yellow ball. So the answer is (A). Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: white, Bob: purple, Claire: pink.\n(1) Bob and Alice swap balls: Alice: purple, Bob: white, Claire: pink.\n(2) Bob and Claire swap balls: Alice: purple, Bob: pink, Claire: white.\n(3) Bob and Alice swap balls: Alice: pink, Bob: purple, Claire: white.\nAt the end of the game, Alice has the pink ball. So the answer is (C). Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: Let's think step by step.\n(0) At the start: Alice: Lola, Bob: Rodrigo, Claire: Patrick.\n(1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.\n(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.\n(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.\nAt the end of the dance, Alice is dancing with Patrick. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_tracking_shuffled_objects_three_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml new file mode 100644 index 00000000..69857561 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml @@ -0,0 +1,5 @@ +"dataset_name": "web_of_lies" +"description": "Evaluate a random boolean function expressed as a word problem.\n\n" +"doc_to_text": " Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?\nA: Let's think step by step.\n(1) Fidel tells the truth. So, we know that Fidel tells the truth.\n(2) Jerry says Fidel tells the truth. Since we know from (1) that Fidel tells the truth, if Jerry says that Fidel tells the truth, then Jerry tells the truth.\n(3) Vina says Jerry tells the truth. Since we know from (2) that Jerry tells the truth, if Vina says Jerry tells the truth, then Vina tells the truth.\n(4) Millicent says Vina lies. Since we know from (3) that Vina tells the truth, if Millicent says Vina lies, then Millicent lies.\n(5) Raymond says Millicent lies.
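All three tracking_shuffled_objects variants above ship the same three-object exemplars, and each reduces to applying a list of swaps to a mapping. An illustrative reimplementation, not part of the patch:

def apply_swaps(initial, swaps):
    """Apply pairwise swaps to a holder -> item mapping."""
    state = dict(initial)
    for a, b in swaps:
        state[a], state[b] = state[b], state[a]
    return state

final = apply_swaps({"Alice": "yellow", "Bob": "blue", "Claire": "pink"},
                    [("Claire", "Alice"), ("Alice", "Bob"), ("Claire", "Bob")])
assert final["Bob"] == "yellow"  # first exemplar -> option (A)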
Since we know from (4) that Millicent lies, if Raymond says Millicent lies, then Raymond tells the truth.\nNow, the question asks: Does Raymond tell the truth? We know from (5) that Raymond tells the truth. So the answer is Yes. Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell the truth?\nA: Let's think step by step.\n(1) Kristian lies. So, we know that Kristian lies.\n(2) Millie says Kristian lies. Since we know from (1) that Kristian lies, if Millie says Kristian lies, then Millie tells the truth.\n(3) Maybelle says Millie tells the truth. Since we know from (2) that Millie tells the truth, if Maybelle says Millie tells the truth, then Maybelle tells the truth.\n(4) Fidel says Maybelle lies. Since we know from (3) that Maybelle tells the truth, if Fidel says Maybelle lies, then Fidel lies.\n(5) Leda says Fidel lies. Since we know from (4) that Fidel lies, if Leda says Fidel lies, then Leda tells the truth.\nNow, the question asks: Does Leda tell the truth? We know from (5) that Leda tells the truth. So the answer is Yes. Question: Kristian tells the truth. Michaela says Kristian lies. Raymond says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says Osvaldo tells the truth. Does Jamey tell the truth?\nA: Let's think step by step.\n(1) Kristian tells the truth. So, we know that Kristian tells the truth.\n(2) Michaela says Kristian lies. Since we know from (1) that Kristian tells the truth, if Michaela says Kristian lies, then Michaela lies.\n(3) Raymond says Michaela tells the truth. Since we know from (2) that Michaela lies, if Raymond says Michaela tells the truth, then Raymond lies.\n(4) Osvaldo says Raymond tells the truth. Since we know from (3) that Raymond lies, if Osvaldo says Raymond tells the truth, then Osvaldo lies.\n(5) Jamey says Osvaldo tells the truth. Since we know from (4) that Osvaldo lies, if Jamey says Osvaldo tells the truth, then Jamey lies.\nNow, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_web_of_lies" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml new file mode 100644 index 00000000..0e2e1a88 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml @@ -0,0 +1,5 @@ +"dataset_name": "word_sorting" +"description": "Sort a list of words.\n\n" +"doc_to_text": " Sort the following words alphabetically: List: oven costume counterpart\nA: Let's think step by step.\nThe first letter: \"oven\": \"o\" (15). \"costume\": \"c\" (3). \"counterpart\": \"c\" (3). We now have: (3) [\"costume\" ? \"counterpart\"] < (15) \"oven\". Now let's sort this subpart [\"costume\" ? \"counterpart\"] by looking at their second letters.\nThe second letter: \"costume\": \"o\" (15). \"counterpart\": \"o\" (15). We now have: (15) [\"costume\" ? \"counterpart\"]. Now let's sort this subpart [\"costume\" ? \"counterpart\"] by looking at their third letters.\nThe third letter: \"costume\": \"s\" (19). \"counterpart\": \"u\" (21). We now have: (19) \"costume\" < (21) \"counterpart\". Hence, we have [\"costume\" < \"counterpart\"] < \"oven\". So the answer is costume counterpart oven. 
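The web_of_lies chains above resolve by propagating a single boolean: each speaker is truthful exactly when their claim about the previous speaker matches what is already known. A compact illustrative version, not part of the patch; the function name and encoding are invented here.

def last_speaker_truthful(first_tells_truth, claims):
    """claims[i] is True for "says the previous person tells the truth"
    and False for "says the previous person lies"."""
    truth = first_tells_truth
    for claims_truth in claims:
        truth = (claims_truth == truth)  # truthful iff the claim matches the fact
    return truth

# First exemplar: Fidel truthful; Jerry/Vina claim truth, Millicent/Raymond claim lies.
assert last_speaker_truthful(True, [True, True, False, False]) is True   # Raymond: Yes
# Third exemplar: Kristian truthful; Michaela claims lies, the rest claim truth.
assert last_speaker_truthful(True, [False, True, True, True]) is False   # Jamey: No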
Sort the following words alphabetically: List: hypochlorite ponderosa phone credulity\nA: Let's think step by step.\nThe first letter: \"hypochlorite\": \"h\" (8). \"ponderosa\": \"p\" (16). \"phone\": \"p\" (16). \"credulity\": \"c\" (3). We now have: (3) \"credulity\" < (8) \"hypochlorite\" < (16) [\"ponderosa\" ? \"phone\"]. Now let's sort this subpart [\"ponderosa\" ? \"phone\"] by looking at their second letters.\nThe second letter: \"ponderosa\": \"o\" (15). \"phone\": \"h\" (8). We now have: (8) \"phone\" < (15) \"ponderosa\". Hence, we have \"credulity\" < \"hypochlorite\" < [\"phone\" <\"ponderosa\"]. So the answer is credulity hypochlorite phone ponderosa. Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census\nA: Let's think step by step.\nThe first letter: \"newt\": \"n\" (14). \"arson\": \"a\" (1). \"parthia\": \"p\" (16). \"seismography\": \"s\" (19). \"mugho\": \"m\" (13). \"aspect\": \"a\" (1). \"census\": \"c\" (3). We now have: (1) [\"arson\" ? \"aspect\"] < (3) \"census\" < (13) \"mugho\" < (14) \"newt\" < (16) \"parthia\" < (19) \"seismography\". Now let's sort this subpart [\"arson\" ? \"aspect\"] by looking at their second letters.\nThe second letter: \"arson\": \"r\" (18). \"aspect\": \"s\" (19). We now have: (18) \"arson\" < (19) \"aspect\". Hence, we have [\"arson\" < \"aspect\"] < \"census\" < \"mugho\" < \"newt\" < \"parthia\" < \"seismography\". So the answer is arson aspect census mugho newt parthia seismography.Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_fewshot_word_sorting" -- GitLab From c06b0d6efb48abe7f171d283ffad765ee204f72a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 10:40:18 +0000 Subject: [PATCH 026/212] add flan_cot_zeroshot --- .../_flan_cot_zeroshot_template_yaml | 22 +++++++++++++++++++ .../boolean_expressions.yaml | 5 +++++ .../flan_cot_zeroshot/causal_judgement.yaml | 5 +++++ .../flan_cot_zeroshot/date_understanding.yaml | 5 +++++ .../flan_cot_zeroshot/disambiguation_qa.yaml | 5 +++++ .../bbh/flan_cot_zeroshot/dyck_languages.yaml | 5 +++++ .../flan_cot_zeroshot/formal_fallacies.yaml | 5 +++++ .../flan_cot_zeroshot/geometric_shapes.yaml | 5 +++++ .../bbh/flan_cot_zeroshot/hyperbaton.yaml | 5 +++++ .../logical_deduction_five_objects.yaml | 5 +++++ .../logical_deduction_seven_objects.yaml | 5 +++++ .../logical_deduction_three_objects.yaml | 5 +++++ .../movie_recommendation.yaml | 5 +++++ .../multistep_arithmetic_two.yaml | 5 +++++ .../tasks/bbh/flan_cot_zeroshot/navigate.yaml | 5 +++++ .../flan_cot_zeroshot/object_counting.yaml | 5 +++++ .../penguins_in_a_table.yaml | 5 +++++ .../reasoning_about_colored_objects.yaml | 5 +++++ .../bbh/flan_cot_zeroshot/ruin_names.yaml | 5 +++++ .../salient_translation_error_detection.yaml | 5 +++++ .../tasks/bbh/flan_cot_zeroshot/snarks.yaml | 5 +++++ .../sports_understanding.yaml | 5 +++++ .../flan_cot_zeroshot/temporal_sequences.yaml | 5 +++++ ...racking_shuffled_objects_five_objects.yaml | 5 +++++ ...acking_shuffled_objects_seven_objects.yaml | 5 +++++ ...acking_shuffled_objects_three_objects.yaml | 5 +++++ .../bbh/flan_cot_zeroshot/web_of_lies.yaml | 5 +++++ .../bbh/flan_cot_zeroshot/word_sorting.yaml | 5 +++++ 28 files changed, 157 insertions(+) create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/boolean_expressions.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/causal_judgement.yaml 
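As a check on the word_sorting exemplars above: their answers coincide with Python's default lexicographic string sort, so they are easy to verify (illustrative only, not part of the patch):

for words, answer in [
    ("oven costume counterpart", "costume counterpart oven"),
    ("hypochlorite ponderosa phone credulity", "credulity hypochlorite phone ponderosa"),
    ("newt arson parthia seismography mugho aspect census",
     "arson aspect census mugho newt parthia seismography"),
]:
    assert " ".join(sorted(words.split())) == answer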
create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/date_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/dyck_languages.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/formal_fallacies.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml create mode 100644 lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml new file mode 100644 index 00000000..0f6fc880 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml @@ -0,0 +1,22 @@ +group: bbh_flan_zeroshot +dataset_path: lukaemon/bbh +output_type: greedy_until +test_split: test +doc_to_target: "{{target}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +generation_kwargs: + until: + - "</s>" + do_sample: false + temperature: 0.0 +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "(?<=The answer is )(.*)(?=.)" + - function: "take_first" \ No newline at end of file diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/boolean_expressions.yaml new file mode 100644 index 00000000..04a6d1e5 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/boolean_expressions.yaml @@ -0,0 +1,5 @@ +"dataset_name": "boolean_expressions" +"description": "Evaluate the result of a random Boolean expression.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_boolean_expressions" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/causal_judgement.yaml new file mode 100644
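A note on the "get-answer" filter in the template above: the lookbehind anchors extraction after the literal "The answer is ", and because the trailing dot in (?=.) is unescaped it matches any character, so the greedy (.*) gives up exactly one final character; in practice that is the closing period of "So the answer is X.", but it is also the last digit of an unpunctuated answer. A quick illustration, not part of the patch:

import re

pattern = re.compile(r"(?<=The answer is )(.*)(?=.)")  # as in the template above
assert pattern.search("Step by step... The answer is (A).").group(1) == "(A)"
# With no trailing punctuation, the unescaped dot still swallows one character:
assert pattern.search("The answer is 42").group(1) == "4"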
index 00000000..73ed31c8 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/causal_judgement.yaml @@ -0,0 +1,5 @@ +"dataset_name": "causal_judgement" +"description": "Answer questions about causal attribution.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_causal_judgement" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/date_understanding.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/date_understanding.yaml new file mode 100644 index 00000000..11ad75db --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/date_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "date_understanding" +"description": "Infer the date from context.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_date_understanding" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/disambiguation_qa.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/disambiguation_qa.yaml new file mode 100644 index 00000000..b8b13d78 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/disambiguation_qa.yaml @@ -0,0 +1,5 @@ +"dataset_name": "disambiguation_qa" +"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_disambiguation_qa" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/dyck_languages.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/dyck_languages.yaml new file mode 100644 index 00000000..5995b6c4 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/dyck_languages.yaml @@ -0,0 +1,5 @@ +"dataset_name": "dyck_languages" +"description": "Correctly close a Dyck-n word.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_dyck_languages" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/formal_fallacies.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/formal_fallacies.yaml new file mode 100644 index 00000000..6b029e7e --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/formal_fallacies.yaml @@ -0,0 +1,5 @@ +"dataset_name": "formal_fallacies" +"description": "Distinguish deductively valid arguments from formal fallacies.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_formal_fallacies" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml new file mode 100644 index 00000000..acb91aa4 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml @@ -0,0 +1,5 @@ +"dataset_name": "geometric_shapes" +"description": "Name geometric shapes from their SVG paths.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_geometric_shapes" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml new file mode 100644 index 00000000..dbe1280b --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml @@ -0,0 +1,5 @@ +"dataset_name": "hyperbaton" +"description": "Order adjectives correctly in English sentences.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_hyperbaton" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml 
b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml new file mode 100644 index 00000000..5592252a --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_five_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_logical_deduction_five_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml new file mode 100644 index 00000000..c85b9d21 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_seven_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_logical_deduction_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml new file mode 100644 index 00000000..e94f8e4a --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_three_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_logical_deduction_three_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml new file mode 100644 index 00000000..038119fc --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml @@ -0,0 +1,5 @@ +"dataset_name": "movie_recommendation" +"description": "Recommend movies similar to the given list of movies.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_movie_recommendation" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml new file mode 100644 index 00000000..c21c1b8c --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml @@ -0,0 +1,5 @@ +"dataset_name": "multistep_arithmetic_two" +"description": "Solve multi-step arithmetic problems.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_multistep_arithmetic_two" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml new file mode 100644 index 00000000..c8ea0681 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml @@ -0,0 +1,5 @@ +"dataset_name": "navigate" +"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml 
b/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml new file mode 100644 index 00000000..6d6a4721 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml @@ -0,0 +1,5 @@ +"dataset_name": "object_counting" +"description": "Questions that involve enumerating objects and asking the model to count them.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml new file mode 100644 index 00000000..c5501700 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml @@ -0,0 +1,5 @@ +"dataset_name": "penguins_in_a_table" +"description": "Answer questions about a table of penguins and their attributes.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..26789385 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "reasoning_about_colored_objects" +"description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml new file mode 100644 index 00000000..3289b750 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml @@ -0,0 +1,5 @@ +"dataset_name": "ruin_names" +"description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml new file mode 100644 index 00000000..c8113e62 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml @@ -0,0 +1,5 @@ +"dataset_name": "salient_translation_error_detection" +"description": "Detect the type of error in an English translation of a German source sentence.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_salient_translation_error_detection" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml new file mode 100644 index 00000000..b9da41c7 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml @@ -0,0 +1,5 @@ +"dataset_name": "snarks" +"description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": 
"_template_yaml" +"task": "bbh_flan_cot_zeroshot_snarks" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml new file mode 100644 index 00000000..dbf21164 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "sports_understanding" +"description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_sports_understanding" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml new file mode 100644 index 00000000..84db7993 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml @@ -0,0 +1,5 @@ +"dataset_name": "temporal_sequences" +"description": "Task description: Answer questions about which times certain events could have occurred.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_temporal_sequences" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 00000000..4b6ec1ad --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_five_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_five_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 00000000..99dbcc33 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_seven_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 00000000..4f9ff8e7 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_three_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_three_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml new file mode 100644 index 
00000000..5304cdfc --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml @@ -0,0 +1,5 @@ +"dataset_name": "web_of_lies" +"description": "Evaluate a random boolean function expressed as a word problem.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_web_of_lies" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml new file mode 100644 index 00000000..62f0a6aa --- /dev/null +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml @@ -0,0 +1,5 @@ +"dataset_name": "word_sorting" +"description": "Sort a list of words.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" +"include": "_template_yaml" +"task": "bbh_flan_cot_zeroshot_word_sorting" -- GitLab From 3531d9c162b58a92aa441111750b516d52b40a19 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 10:40:45 +0000 Subject: [PATCH 027/212] add flan_fewshot --- .../bbh/flan_fewshot/_flan_fewshot_template_yaml | 16 ++++++++++++++++ .../bbh/flan_fewshot/boolean_expressions.yaml | 5 +++++ .../tasks/bbh/flan_fewshot/causal_judgement.yaml | 5 +++++ .../bbh/flan_fewshot/date_understanding.yaml | 5 +++++ .../bbh/flan_fewshot/disambiguation_qa.yaml | 5 +++++ .../tasks/bbh/flan_fewshot/dyck_languages.yaml | 5 +++++ .../tasks/bbh/flan_fewshot/formal_fallacies.yaml | 5 +++++ .../tasks/bbh/flan_fewshot/geometric_shapes.yaml | 5 +++++ lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml | 5 +++++ .../logical_deduction_five_objects.yaml | 5 +++++ .../logical_deduction_seven_objects.yaml | 5 +++++ .../logical_deduction_three_objects.yaml | 5 +++++ .../bbh/flan_fewshot/movie_recommendation.yaml | 5 +++++ .../flan_fewshot/multistep_arithmetic_two.yaml | 5 +++++ lm_eval/tasks/bbh/flan_fewshot/navigate.yaml | 5 +++++ .../tasks/bbh/flan_fewshot/object_counting.yaml | 5 +++++ .../bbh/flan_fewshot/penguins_in_a_table.yaml | 5 +++++ .../reasoning_about_colored_objects.yaml | 5 +++++ lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml | 5 +++++ .../salient_translation_error_detection.yaml | 5 +++++ lm_eval/tasks/bbh/flan_fewshot/snarks.yaml | 5 +++++ .../bbh/flan_fewshot/sports_understanding.yaml | 5 +++++ .../bbh/flan_fewshot/temporal_sequences.yaml | 5 +++++ .../tracking_shuffled_objects_five_objects.yaml | 5 +++++ .../tracking_shuffled_objects_seven_objects.yaml | 5 +++++ .../tracking_shuffled_objects_three_objects.yaml | 5 +++++ lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml | 5 +++++ lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml | 5 +++++ 28 files changed, 151 insertions(+) create mode 100644 lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml 
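A note on how these task YAMLs compose: each leaf file in these patches carries only its per-subtask keys (dataset_name, description, doc_to_text, task) and pulls everything else in through its include entry. The sketch below shows the effective config this yields for one of the flan_cot_zeroshot tasks above, assuming simple leaf-wins merge semantics and assuming the (unshown) flan_cot_zeroshot template mirrors the sibling _flan_*_template_yaml files added in the following patches; it is illustrative, not harness code.

import yaml

# Shared keys, modeled on the sibling templates in these patches -- an
# assumption, since the flan_cot_zeroshot template itself is not shown here.
template = yaml.safe_load('''
dataset_path: lukaemon/bbh
output_type: greedy_until
test_split: test
doc_to_target: "{{target}}"
''')

# Per-subtask keys, taken verbatim from word_sorting.yaml above.
leaf = yaml.safe_load('''
dataset_name: word_sorting
description: "Sort a list of words.\\n\\n"
doc_to_text: "Q: {{input}}\\nA: Let's think step by step.\\n"
task: bbh_flan_cot_zeroshot_word_sorting
''')

effective = {**template, **leaf}  # leaf keys are assumed to override template keys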
create mode 100644 lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/navigate.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/snarks.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml create mode 100644 lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml diff --git a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml new file mode 100644 index 00000000..ff53aabc --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml @@ -0,0 +1,16 @@ +group: bbh_flan_fewshot +dataset_path: lukaemon/bbh +output_type: greedy_until +test_split: test +doc_to_target: "{{target}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +generation_kwargs: + until: + - "</s>" + do_sample: false + temperature: 0.0 diff --git a/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml new file mode 100644 index 00000000..19d24f3f --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml @@ -0,0 +1,5 @@ +"dataset_name": "boolean_expressions" +"description": "Evaluate the result of a random Boolean expression.\n\n" +"doc_to_text": "Q: not ( ( not not True ) ) is\nA: False\n\nQ: True and False and not True and True is\nA: False\n\nQ: not not ( not ( False ) ) is\nA: True\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_boolean_expressions" diff --git a/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml new file mode 100644 index 00000000..b9dd8f6e --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml @@ -0,0 +1,5 @@ +"dataset_name": "causal_judgement" +"description": "Answer questions about causal attribution.\n\n" +"doc_to_text": "Q: How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T.
intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: No\n\nQ: How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_causal_judgement" diff --git a/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml b/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml new file mode 100644 index 00000000..5ed01c22 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "date_understanding" +"description": "Infer the date from context.\n\n" +"doc_to_text": "Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\nOptions:\n(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\nA: (D)\n\nQ: Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\nOptions:\n(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n(D) 11/02/2018\n(E) 11/04/2018\nA: (B)\n\nQ: Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. 
What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\nA: (B)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_date_understanding" diff --git a/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml b/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml new file mode 100644 index 00000000..0c04056f --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml @@ -0,0 +1,5 @@ +"dataset_name": "disambiguation_qa" +"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" +"doc_to_text": "Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The chief told the counselor that they took the day off.\nOptions:\n(A) The chief took the day off\n(B) The counselor took the day off\n(C) Ambiguous\nA: (A)\n\nQ: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The manager sent a message to the secretary, but he didn't reply yet.\nOptions:\n(A) The secretary didn't reply yet\n(B) The manager didn't reply yet\n(C) Ambiguous\nA: (A)\n\nQ: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: Bailey will plan to meet the director at his office\nOptions:\n(A) It will be Bailey's office\n(B) It will be the director's office\n(C) Ambiguous\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_disambiguation_qa" diff --git a/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml b/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml new file mode 100644 index 00000000..84e308cf --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml @@ -0,0 +1,5 @@ +"dataset_name": "dyck_languages" +"description": "Correctly close a Dyck-n word.\n\n" +"doc_to_text": "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [\nA: ] } ]\n\nQ: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]\nA: ] ) )\n\nQ: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }\nA: } ] >\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_dyck_languages" diff --git a/lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml b/lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml new file mode 100644 index 00000000..c91769a5 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml @@ -0,0 +1,5 @@ +"dataset_name": "formal_fallacies" +"description": "Distinguish deductively valid arguments from formal fallacies.\n\n" +"doc_to_text": "Q: \"It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: valid\n\nQ: \"It is not always easy to see who is related to whom -- and in which ways. 
The following argument pertains to this question: Whoever is not a great-grandfather of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient for not being a great-grandfather of Clyde. We may conclude: Everyone who is an ancestor of Dana is a stepbrother of Brian, too.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: valid\n\nQ: \"It is not always easy to grasp who is consuming which products. The following argument pertains to this question: Every infrequent user of Paul Mitchell shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo and, in the same time, a loyal buyer of Caress soap. It follows that whoever is an infrequent user of Paul Mitchell shampoo is not a regular consumer of Lush soap.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: invalid\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_formal_fallacies" diff --git a/lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml new file mode 100644 index 00000000..1dbb242f --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml @@ -0,0 +1,5 @@ +"dataset_name": "geometric_shapes" +"description": "Name geometric shapes from their SVG paths.\n\n" +"doc_to_text": "Q: This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: (F)\n\nQ: This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: (G)\n\nQ: This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: (D)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_geometric_shapes" diff --git a/lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml new file mode 100644 index 00000000..090865b2 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml @@ -0,0 +1,5 @@ +"dataset_name": "hyperbaton" +"description": "Order adjectives correctly in English sentences.\n\n" +"doc_to_text": "Q: Which sentence has the correct adjective order:\nOptions:\n(A) rubber terrible ship\n(B) terrible rubber ship\nA: (B)\n\nQ: Which sentence has the correct adjective order:\nOptions:\n(A) repulsive small Brazilian exercise ship\n(B) Brazilian repulsive exercise small ship\nA: (A)\n\nQ: Which sentence has the correct adjective order:\nOptions:\n(A) blue gold wonderful square shoe\n(B) wonderful square blue gold shoe\nA: (B)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_hyperbaton" diff --git a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml new file mode 100644 index 00000000..67f2c1a7 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_five_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: The following paragraphs each describe a set of three objects 
arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: (B)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: (A)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_logical_deduction_five_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml new file mode 100644 index 00000000..47593a0d --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_seven_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: (B)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: (A)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. 
The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_logical_deduction_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml new file mode 100644 index 00000000..7264e653 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_three_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: (B)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: (A)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. 
The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_logical_deduction_three_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml new file mode 100644 index 00000000..8fb208b0 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml @@ -0,0 +1,5 @@ +"dataset_name": "movie_recommendation" +"description": "Recommend movies similar to the given list of movies.\n\n" +"doc_to_text": "Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:\nOptions:\n(A) Tetsuo\n(B) the Ironman\n(C) The Princess Bride\n(D) The Barkley Marathons The Race That Eats Its Young\n(E) Bug\nA: (C)\n\nQ: Find a movie similar to Twister, The Silence of the Lambs, Independence Day, Braveheart:\nOptions:\n(A) They Shoot Horses\n(B) Don't They\n(C) Forrest Gump\n(D) The Salton Sea\n(E) Extreme Days\nA: (C)\n\nQ: Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest Gump:\nOptions:\n(A) Phenomena\n(B) Lilting\n(C) Catwoman\n(D) Edge of Tomorrow\nA: (D)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_movie_recommendation" diff --git a/lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml new file mode 100644 index 00000000..ba5f65ca --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml @@ -0,0 +1,5 @@ +"dataset_name": "multistep_arithmetic_two" +"description": "Solve multi-step arithmetic problems.\n\n" +"doc_to_text": "Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =\nA: 123\n\nQ: ((-9 * 7 * 7 * -9) + (4 * -9 - 8 - -4)) =\nA: 3929\n\nQ: ((-3 + 5 * 8 * -4) - (9 - 8 * -7 + -9)) =\nA: -219\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_multistep_arithmetic_two" diff --git a/lm_eval/tasks/bbh/flan_fewshot/navigate.yaml b/lm_eval/tasks/bbh/flan_fewshot/navigate.yaml new file mode 100644 index 00000000..0aba5820 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/navigate.yaml @@ -0,0 +1,5 @@ +"dataset_name": "navigate" +"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" +"doc_to_text": "Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\nOptions:\n- Yes\n- No\nA: No\n\nQ: If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.\nOptions:\n- Yes\n- No\nA: No\n\nQ: If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. 
Take 7 steps right.\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml b/lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml new file mode 100644 index 00000000..7aa27a38 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml @@ -0,0 +1,5 @@ +"dataset_name": "object_counting" +"description": "Questions that involve enumerating objects and asking the model to count them.\n\n" +"doc_to_text": "Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?\nA: 6\n\nQ: I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape, a nectarine, and three plums. How many fruits do I have?\nA: 11\n\nQ: I have a lettuce head, a head of broccoli, an onion, a stalk of celery, two carrots, a garlic, and a yam. How many vegetables do I have?\nA: 8\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml new file mode 100644 index 00000000..f91d9c98 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml @@ -0,0 +1,5 @@ +"dataset_name": "penguins_in_a_table" +"description": "Answer questions about a table of penguins and their attributes.\n\n" +"doc_to_text": "Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:\nJames, 12, 90, 12\nHow many penguins are less than 8 years old?\nOptions:\n(A) 1\n(B) 2\n(C) 3\n(D) 4\n(E) 5\nA: (B)\n\nQ: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. Which is the youngest penguin?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: (B)\n\nQ: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. What is the name of the second penguin sorted by alphabetic order?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: (D)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..d03dcd07 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "reasoning_about_colored_objects" +"description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" +"doc_to_text": "Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. 
What color is the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: (E)\n\nQ: On the table, you see a bunch of objects arranged in a row: a purple paperclip, a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve fidget spinner, and a burgundy pen. What is the color of the object directly to the right of the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: (F)\n\nQ: On the nightstand, you see the following items arranged in a row: a teal plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, a pink notebook, and a grey cup. How many non-orange items do you see to the left of the teal item?\nOptions:\n(A) zero\n(B) one\n(C) two\n(D) three\n(E) four\n(F) five\n(G) six\nA: (A)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml new file mode 100644 index 00000000..d12013db --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml @@ -0,0 +1,5 @@ +"dataset_name": "ruin_names" +"description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" +"doc_to_text": "Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?\nOptions:\n(A) whitesnape\n(B) whitesnapke\n(C) whitesnuake\n(D) mwhitesnake\nA: (A)\n\nQ: Which of the following is a humorous edit of this artist or movie name: 'one of our dinosaurs is missing'?\nOptions:\n(A) ofne of our dinosaurs is missing\n(B) one af our dinosaurs is missing\n(C) one of our dinosaurs is pissing\n(D) one of our dinosaur is missing\nA: (C)\n\nQ: Which of the following is a humorous edit of this artist or movie name: 'counting crows'?\nOptions:\n(A) countingy crows\n(B) counting cows\n(C) courting crows\n(D) coutnting crows\nA: (B)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml new file mode 100644 index 00000000..bfbcfa35 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml @@ -0,0 +1,5 @@ +"dataset_name": "salient_translation_error_detection" +"description": "Detect the type of error in an English translation of a German source sentence.\n\n" +"doc_to_text": "Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. 
Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.\nTranslation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: (D)\n\nQ: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: Auf dieser Seite sind die Baudenkmäler der oberbayerischen Großen Kreisstadt Landsberg am Lech zusammengestellt.\nTranslation: On this page are compiled the architectural monuments of the town of Landsberg am Lech.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: (E)\n\nQ: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. 
Source: Łeba ist eine Kleinstadt und ein Badeort im Powiat Lęborski der polnischen Woiwodschaft Pommern.\nTranslation: Eba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_salient_translation_error_detection" diff --git a/lm_eval/tasks/bbh/flan_fewshot/snarks.yaml b/lm_eval/tasks/bbh/flan_fewshot/snarks.yaml new file mode 100644 index 00000000..375da1a2 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/snarks.yaml @@ -0,0 +1,5 @@ +"dataset_name": "snarks" +"description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n" +"doc_to_text": "Q: Which statement is sarcastic?\nOptions:\n(A) Yes, because having interests and actively researching them is a huge waste\n(B) Yes, because having interests and actively researching them is a huge deal\nA: (A)\n\nQ: Which statement is sarcastic?\nOptions:\n(A) No one is going to disagree with you on this. Avoiding ad hominem attacks really help your case\n(B) No one is going to disagree with you on this. Ad hominem attacks really help your case\nA: (B)\n\nQ: Which statement is sarcastic?\nOptions:\n(A) Consistency in the league's punishments? What do you think this is supposed to be, politics?\n(B) Consistency in the league's punishments? What do you think this is supposed to be, moral?\nA: (A)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_snarks" diff --git a/lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml new file mode 100644 index 00000000..f72cdba9 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "sports_understanding" +"description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" +"doc_to_text": "Q: Is the following sentence plausible? \"Bam Adebayo scored a reverse layup in the Western Conference Finals.\"\nA: yes\n\nQ: Is the following sentence plausible? \"Santi Cazorla scored a touchdown.\"\nA: no\n\nQ: Is the following sentence plausible? \"DeMar DeRozan was called for the goal tend.\"\nA: yes\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_sports_understanding" diff --git a/lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml new file mode 100644 index 00000000..0f5e5380 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml @@ -0,0 +1,5 @@ +"dataset_name": "temporal_sequences" +"description": "Task description: Answer questions about which times certain events could have occurred.\n\n" +"doc_to_text": "Q: Today, Emily went to the museum. 
Between what times could they have gone?\nWe know that:\nEmily woke up at 1pm.\nElizabeth saw Emily reading at the library from 2pm to 4pm.\nJessica saw Emily watching a movie at the theater from 4pm to 5pm.\nLeslie saw Emily waiting at the airport from 5pm to 6pm.\nWilliam saw Emily buying clothes at the mall from 6pm to 7pm.\nThe museum was closed after 7pm.\nBetween what times could Emily have gone to the museum?\nOptions:\n(A) 1pm to 2pm\n(B) 6pm to 7pm\n(C) 5pm to 6pm\n(D) 2pm to 4pm\nA: (A)\n\nQ: Today, Elizabeth went to the amusement park. Between what times could they have gone?\nWe know that:\nElizabeth woke up at 7am.\nDavid saw Elizabeth fixing their computer at the electronic store from 1pm to 2pm.\nSarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm.\nSusan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm.\nAndrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm.\nEmily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm.\nThe amusement park was closed after 10pm.\nBetween what times could Elizabeth have gone to the amusement park?\nOptions:\n(A) 7am to 1pm\n(B) 9pm to 10pm\n(C) 1pm to 2pm\n(D) 3pm to 6pm\nA: (A)\n\nQ: Today, Tiffany went to the beach. Between what times could they have gone?\nWe know that:\nTiffany woke up at 5am.\nBetty saw Tiffany getting a coffee at the cafe from 5am to 6am.\nJessica saw Tiffany working at the office from 6am to 9am.\nJohn saw Tiffany stretching at a yoga studio from 9am to 12pm.\nSean saw Tiffany sitting on a rooftop from 12pm to 2pm.\nSarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm.\nThe beach was closed after 4pm.\nBetween what times could Tiffany have gone to the beach?\nOptions:\n(A) 9am to 12pm\n(B) 12pm to 2pm\n(C) 5am to 6am\n(D) 3pm to 4pm\nA: (D)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_temporal_sequences" diff --git a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 00000000..112ede19 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_five_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: (A)\n\nQ: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: (C)\n\nQ: Alice, Bob, and Claire are dancers at a square dance. 
At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_tracking_shuffled_objects_five_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 00000000..83821c54 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_seven_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: (A)\n\nQ: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: (C)\n\nQ: Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_tracking_shuffled_objects_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 00000000..afee9bb9 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_three_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. 
At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: (A)\n\nQ: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: (C)\n\nQ: Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: (C)\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_tracking_shuffled_objects_three_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml new file mode 100644 index 00000000..2f1c5686 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml @@ -0,0 +1,5 @@ +"dataset_name": "web_of_lies" +"description": "Evaluate a random boolean function expressed as a word problem.\n\n" +"doc_to_text": "Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?\nA: Yes\n\nQ: Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell the truth?\nA: Yes\n\nQ: Question: Kristian tells the truth. Michaela says Kristian lies. Raymond says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says Osvaldo tells the truth. 
Does Jamey tell the truth?\nA: No\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_web_of_lies" diff --git a/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml new file mode 100644 index 00000000..43b7d43d --- /dev/null +++ b/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml @@ -0,0 +1,5 @@ +"dataset_name": "word_sorting" +"description": "Sort a list of words.\n\n" +"doc_to_text": "Q: Sort the following words alphabetically: List: oven costume counterpart\nA: costume counterpart oven\n\nQ: Sort the following words alphabetically: List: hypochlorite ponderosa phone credulity\nA: credulity hypochlorite phone ponderosa\n\nQ: Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census\nA: arson aspect census mugho newt parthia seismography\n\nQ: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_fewshot_word_sorting" -- GitLab From 0d195e90f2796069f6c1d92e7f19854a66b11245 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 10:41:10 +0000 Subject: [PATCH 028/212] add flan_zeroshot --- .../flan_zeroshot/_flan_zeroshot_template_yaml | 16 ++++++++++++++++ .../bbh/flan_zeroshot/boolean_expressions.yaml | 5 +++++ .../bbh/flan_zeroshot/causal_judgement.yaml | 5 +++++ .../bbh/flan_zeroshot/date_understanding.yaml | 5 +++++ .../bbh/flan_zeroshot/disambiguation_qa.yaml | 5 +++++ .../tasks/bbh/flan_zeroshot/dyck_languages.yaml | 5 +++++ .../bbh/flan_zeroshot/formal_fallacies.yaml | 5 +++++ .../bbh/flan_zeroshot/geometric_shapes.yaml | 5 +++++ lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml | 5 +++++ .../logical_deduction_five_objects.yaml | 5 +++++ .../logical_deduction_seven_objects.yaml | 5 +++++ .../logical_deduction_three_objects.yaml | 5 +++++ .../bbh/flan_zeroshot/movie_recommendation.yaml | 5 +++++ .../flan_zeroshot/multistep_arithmetic_two.yaml | 5 +++++ lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml | 5 +++++ .../tasks/bbh/flan_zeroshot/object_counting.yaml | 5 +++++ .../bbh/flan_zeroshot/penguins_in_a_table.yaml | 5 +++++ .../reasoning_about_colored_objects.yaml | 5 +++++ lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml | 5 +++++ .../salient_translation_error_detection.yaml | 5 +++++ lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml | 5 +++++ .../bbh/flan_zeroshot/sports_understanding.yaml | 5 +++++ .../bbh/flan_zeroshot/temporal_sequences.yaml | 5 +++++ .../tracking_shuffled_objects_five_objects.yaml | 5 +++++ .../tracking_shuffled_objects_seven_objects.yaml | 5 +++++ .../tracking_shuffled_objects_three_objects.yaml | 5 +++++ lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml | 5 +++++ .../tasks/bbh/flan_zeroshot/word_sorting.yaml | 5 +++++ 28 files changed, 151 insertions(+) create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml create mode 100644 
lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml create mode 100644 lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml diff --git a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml new file mode 100644 index 00000000..832c728d --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml @@ -0,0 +1,16 @@ +group: bbh_flan_zeroshot +dataset_path: lukaemon/bbh +output_type: greedy_until +test_split: test +doc_to_target: "{{target}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +generation_kwargs: + until: + - "</s>" + do_sample: false + temperature: 0.0 diff --git a/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml new file mode 100644 index 00000000..7098c7e3 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml @@ -0,0 +1,5 @@ +"dataset_name": "boolean_expressions" +"description": "Evaluate the result of a random Boolean expression.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_boolean_expressions" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml new file mode 100644 index 00000000..953419bd --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml @@ -0,0 +1,5 @@ +"dataset_name": "causal_judgement" +"description": "Answer questions about causal attribution.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_causal_judgement" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml b/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml new file mode 100644 index 00000000..99255c90 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "date_understanding" +"description": "Infer the date from context.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_date_understanding" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml
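With the zeroshot files above, a full request prompt reduces to description plus the rendered doc_to_text. The sketch below shows that assembly; Jinja2 templating is assumed (implied by the {{input}} placeholders), as is the harness prepending description to each request -- the sample input is a real BBH boolean_expressions item, but the assembly code itself is illustrative, not harness code.

from jinja2 import Template

description = "Evaluate the result of a random Boolean expression.\n\n"  # from boolean_expressions.yaml
doc_to_text = "Q: {{input}}\nA:"  # from the same file
row = {"input": "not ( ( not not True ) ) is"}  # sample BBH test item

prompt = description + Template(doc_to_text).render(**row)
print(prompt)
# Evaluate the result of a random Boolean expression.
#
# Q: not ( ( not not True ) ) is
# A: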
b/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml new file mode 100644 index 00000000..65c515cc --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml @@ -0,0 +1,5 @@ +"dataset_name": "disambiguation_qa" +"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_disambiguation_qa" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml b/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml new file mode 100644 index 00000000..10b87a70 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml @@ -0,0 +1,5 @@ +"dataset_name": "dyck_languages" +"description": "Correctly close a Dyck-n word.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_dyck_languages" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml b/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml new file mode 100644 index 00000000..7fcf6920 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml @@ -0,0 +1,5 @@ +"dataset_name": "formal_fallacies" +"description": "Distinguish deductively valid arguments from formal fallacies.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_formal_fallacies" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml new file mode 100644 index 00000000..ee6082b9 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml @@ -0,0 +1,5 @@ +"dataset_name": "geometric_shapes" +"description": "Name geometric shapes from their SVG paths.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_geometric_shapes" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml new file mode 100644 index 00000000..3e82c854 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml @@ -0,0 +1,5 @@ +"dataset_name": "hyperbaton" +"description": "Order adjectives correctly in English sentences.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_hyperbaton" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml new file mode 100644 index 00000000..1e4adeb4 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_five_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_logical_deduction_five_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml new file mode 100644 index 00000000..910ca139 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_seven_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_logical_deduction_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml 
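The templates in these patches score generations with exact_match under ignore_case and ignore_punctuation, which is what lets a generation like "(d)" match a target like "(D)". The function below is one plausible reading of those two flags as a normalization step; the harness's actual implementation may differ, so treat it as illustrative only.

import string

def normalize(text: str) -> str:
    text = text.strip().lower()  # ignore_case
    # ignore_punctuation: drop ASCII punctuation before comparing
    return text.translate(str.maketrans("", "", string.punctuation))

print(normalize("(d)") == normalize("(D)"))  # True under this normalization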
b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml new file mode 100644 index 00000000..405cf023 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "logical_deduction_three_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_logical_deduction_three_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml new file mode 100644 index 00000000..54dc45f3 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml @@ -0,0 +1,5 @@ +"dataset_name": "movie_recommendation" +"description": "Recommend movies similar to the given list of movies.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_movie_recommendation" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml new file mode 100644 index 00000000..494b94fe --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml @@ -0,0 +1,5 @@ +"dataset_name": "multistep_arithmetic_two" +"description": "Solve multi-step arithmetic problems.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_multistep_arithmetic_two" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml b/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml new file mode 100644 index 00000000..3f107003 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml @@ -0,0 +1,5 @@ +"dataset_name": "navigate" +"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml b/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml new file mode 100644 index 00000000..8e1a675e --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml @@ -0,0 +1,5 @@ +"dataset_name": "object_counting" +"description": "Questions that involve enumerating objects and asking the model to count them.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml new file mode 100644 index 00000000..c3c4138c --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml @@ -0,0 +1,5 @@ +"dataset_name": "penguins_in_a_table" +"description": "Answer questions about a table of penguins and their attributes.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..bbe01119 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "reasoning_about_colored_objects" +"description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": 
"bbh_flan_zeroshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml new file mode 100644 index 00000000..b43e9414 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml @@ -0,0 +1,5 @@ +"dataset_name": "ruin_names" +"description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml new file mode 100644 index 00000000..43ebe9f0 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml @@ -0,0 +1,5 @@ +"dataset_name": "salient_translation_error_detection" +"description": "Detect the type of error in an English translation of a German source sentence.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_salient_translation_error_detection" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml b/lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml new file mode 100644 index 00000000..df46e580 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml @@ -0,0 +1,5 @@ +"dataset_name": "snarks" +"description": "Determine which of two sentences is sarcastic.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_snarks" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml new file mode 100644 index 00000000..fdbc3287 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml @@ -0,0 +1,5 @@ +"dataset_name": "sports_understanding" +"description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_sports_understanding" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml new file mode 100644 index 00000000..4a526778 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml @@ -0,0 +1,5 @@ +"dataset_name": "temporal_sequences" +"description": "Task description: Answer questions about which times certain events could have occurred.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_temporal_sequences" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 00000000..39d96c56 --- /dev/null +++ b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,5 @@ +"dataset_name": "tracking_shuffled_objects_five_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: {{input}}\nA:" +"include": "_template_yaml" +"task": "bbh_flan_zeroshot_tracking_shuffled_objects_five_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 00000000..c1f42e8f --- /dev/null +++ 
b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml
@@ -0,0 +1,5 @@
+"dataset_name": "tracking_shuffled_objects_seven_objects"
+"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+"doc_to_text": "Q: {{input}}\nA:"
+"include": "_template_yaml"
+"task": "bbh_flan_zeroshot_tracking_shuffled_objects_seven_objects"
diff --git a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml
new file mode 100644
index 00000000..0e02323d
--- /dev/null
+++ b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml
@@ -0,0 +1,5 @@
+"dataset_name": "tracking_shuffled_objects_three_objects"
+"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+"doc_to_text": "Q: {{input}}\nA:"
+"include": "_template_yaml"
+"task": "bbh_flan_zeroshot_tracking_shuffled_objects_three_objects"
diff --git a/lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml
new file mode 100644
index 00000000..179aab6a
--- /dev/null
+++ b/lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml
@@ -0,0 +1,5 @@
+"dataset_name": "web_of_lies"
+"description": "Evaluate a random boolean function expressed as a word problem.\n\n"
+"doc_to_text": "Q: {{input}}\nA:"
+"include": "_template_yaml"
+"task": "bbh_flan_zeroshot_web_of_lies"
diff --git a/lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml
new file mode 100644
index 00000000..9317b875
--- /dev/null
+++ b/lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml
@@ -0,0 +1,5 @@
+"dataset_name": "word_sorting"
+"description": "Sort a list of words.\n\n"
+"doc_to_text": "Q: {{input}}\nA:"
+"include": "_template_yaml"
+"task": "bbh_flan_zeroshot_word_sorting"
--
GitLab

From 86e785899d703bd2edc37df2e7e872220f3fb2cf Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 4 Sep 2023 10:41:57 +0000
Subject: [PATCH 029/212] modified changes to fix loglikelihood prediction for seq2seq

---
 lm_eval/models/huggingface.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index 65479d40..4b509352 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -409,12 +409,13 @@ class HFLM(LM):
             utils.clear_torch_cache()
         return batch_size

-    def tok_encode(self, string: str, left_truncate_len=None):
+    def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None):
         """ """
-        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
-            add_special_tokens = False
-        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
-            add_special_tokens = True
+        if add_special_tokens is None:
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                add_special_tokens = False
+            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                add_special_tokens = True

         encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)

@@ -529,8 +530,12 @@ class HFLM(LM):
         if n_spaces > 0:
             continuation = context[-n_spaces:] + continuation
             context = context[:-n_spaces]
-        whole_enc = self.tok_encode(context + continuation)
-        context_enc = self.tok_encode(context)
+
+        whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
+        context_enc = self.tok_encode(context, add_special_tokens=False)
+
+        # whole_enc = self.tok_encode(context + continuation)
+        # context_enc = self.tok_encode(context, add_special_tokens=False)
         context_enc_len = len(context_enc)
         continuation_enc = whole_enc[context_enc_len:]
         return context_enc, continuation_enc
--
GitLab

From 96d9e38c5e302b5c4dcddae2542a9ec10333f057 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 4 Sep 2023 10:50:20 +0000
Subject: [PATCH 030/212] update benchmarks

---
 .../flan/prompt_templates/flan_arc.yaml | 24 +++++++++++++++++++
 lm_eval/benchmarks/flan_held_out.yaml   | 12 ++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml

diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml
new file mode 100644
index 00000000..c9135a51
--- /dev/null
+++ b/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml
@@ -0,0 +1,24 @@
+# Flan Prompt Templates
+prompts:
+  "template-0":
+    doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+  "template-1":
+    doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
+    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+  "template-2":
+    doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+  "template-3":
+    doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
+    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+  "template-4":
+    doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+  "template-5":
+    doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+  "template-6":
+    doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+
diff --git a/lm_eval/benchmarks/flan_held_out.yaml b/lm_eval/benchmarks/flan_held_out.yaml
index cde82722..f61affea 100644
--- a/lm_eval/benchmarks/flan_held_out.yaml
+++ b/lm_eval/benchmarks/flan_held_out.yaml
@@ -1,4 +1,12 @@
 group: flan_held_out
 task:
-  - bbh_flan
-  - mmlu_flan
+  # BBH
+  - bbh_flan_zeroshot
+  - bbh_flan_fewshot
+  - bbh_flan_cot_fewshot
+  - bbh_flan_cot_zeroshot
+  # MMLU
+  - mmlu_flan_n_shot_generative
+  - mmlu_flan_n_shot_loglikelihood
+  - mmlu_flan_cot_zeroshot
+  - mmlu_flan_cot_fewshot
--
GitLab

From c8b76a3dad9fb9c3c5d89db695b2b758e223f31c Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 4 Sep 2023 10:50:50 +0000
Subject: [PATCH 031/212] edit group name

---
 .../tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml | 3 +--
 .../mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml    | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
index 3f649666..c52f8dc7 100644
---
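Patch 029 above is the substantive fix in this stretch: `_encode_pair` now tokenizes both the context and the full context-plus-continuation string with `add_special_tokens=False`, so the slice `whole_enc[len(context_enc):]` lines up with the true continuation tokens. With special tokens enabled, a seq2seq tokenizer such as T5's appends `</s>` to the context encoding, which shifts the slice so it drops the first continuation token and picks up a stray end-of-sequence id. A toy illustration (assumes `transformers` and `sentencepiece` are installed; `t5-small` is just a convenient example checkpoint, not one used by the patch):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")
context, continuation = "Question: 2 + 2 =", " four"

# With special tokens, the context encoding gains a trailing </s> (id 1 for
# T5), so the naive slice starts one token late and ends with a spurious eos.
ctx_special = tok.encode(context, add_special_tokens=True)
whole_special = tok.encode(context + continuation, add_special_tokens=True)
misaligned = whole_special[len(ctx_special):]

# Encoding both sides without special tokens keeps the slice aligned.
ctx = tok.encode(context, add_special_tokens=False)
whole = tok.encode(context + continuation, add_special_tokens=False)
aligned = whole[len(ctx):]

print(misaligned)  # missing the first continuation token, plus a trailing 1
print(aligned)     # exactly the continuation's tokens as seen in context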
a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml @@ -1,6 +1,5 @@ -group: mmlu_flan +group: mmlu_flan_n_shot_generative dataset_path: cais/mmlu -# validation_split: validation test_split: test fewshot_split: dev # doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: " diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml index 2a09f787..2162ade8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml @@ -1,4 +1,4 @@ -group: mmlu_flan_loglikelihood +group: mmlu_flan_n_shot_loglikelihood dataset_path: cais/mmlu # validation_split: validation test_split: test -- GitLab From e795efcfacb1f16e0ba79dee221ff18e44906f52 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 10:51:16 +0000 Subject: [PATCH 032/212] updates --- lm_eval/tasks/bbh/README.md | 2 +- lm_eval/tasks/bbh/_generate_configs.py | 92 +++++++++++++++---- lm_eval/tasks/bbh/_template_yaml | 15 --- .../_flan_cot_fewshot_template_yaml | 2 +- .../_flan_cot_zeroshot_template_yaml | 2 +- 5 files changed, 75 insertions(+), 38 deletions(-) delete mode 100644 lm_eval/tasks/bbh/_template_yaml diff --git a/lm_eval/tasks/bbh/README.md b/lm_eval/tasks/bbh/README.md index 91be60fc..4f8dad49 100644 --- a/lm_eval/tasks/bbh/README.md +++ b/lm_eval/tasks/bbh/README.md @@ -25,7 +25,7 @@ Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard #### Groups -- `bbh` +- `bbh_flan_zeroshot` #### Tasks diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index 9e603994..8d805a64 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -1,32 +1,84 @@ +""" +Take in a YAML, and output all other splits with this YAML +""" +import os +import re import yaml -import inspect -import datasets +import requests +import argparse +import datasets from tqdm import tqdm +from lm_eval import utils +from lm_eval.logger import eval_logger + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--base_yaml_path", required=True) + parser.add_argument( + "--save_prefix_path", default="flan_zeroshot" + ) + parser.add_argument( + "--cot", default=False + ) + parser.add_argument( + "--fewshot", default=False + ) + parser.add_argument("--task_prefix", default="") + return parser.parse_args() + + +if __name__ == "__main__": + + args = parse_args() -def main() -> None: + # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
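The `doc_to_text` fields in these MMLU templates are Jinja2 strings rendered against each dataset row, so a prompt can be previewed directly. A small sketch (the sample row follows the `cais/mmlu` schema; the template string is the generative one above):

from jinja2 import Template

doc = {
    "question": "What is the characteristic of the ring Z?",
    "choices": ["0", "3", "12", "30"],
    "answer": 0,
}
doc_to_text = "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
print(Template(doc_to_text).render(**doc))
# Q: What is the characteristic of the ring Z?
# (A) 0 (B) 3 (C) 12 (D) 30
# A: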
+ base_yaml_name = os.path.split(args.base_yaml_path)[-1] + with open(args.base_yaml_path) as f: + base_yaml = yaml.full_load(f) + + base_doc_to_text = "Q: {{input}}\nA:" + answer_regex = re.compile("(?<=answer is )(.*)(?=.)") dataset_path = "lukaemon/bbh" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): - file_name = f"{task}.yaml" - try: - with open(f"{file_name}", "w") as f: - f.write("# Generated by _generate_configs.py\n") - yaml.dump( - { - "include": "_template_yaml", - "task": f"{dataset_path.split('/')[-1]}_{task}", - "dataset_name": task, - }, - f, - ) - except FileExistsError: - pass + resp = requests.get(f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt").content.decode('utf-8') + prompt = resp.split("\n-----\n")[-1] + description, *few_shot = prompt.split("\n\nQ:") + + prefix_doc_to_text = "" + if args.fewshot: + if args.cot: + prefix_doc_to_text = " ".join(few_shot) + else: + for shot in few_shot: + shot = "Q:"+shot + try: + answer = answer_regex.search(shot)[0] + except: + print("task", task) + print(shot) + example = shot.split("Let\'s think step by step.")[0] + prefix_doc_to_text += f"{example}{answer}\n\n" + + doc_to_text = prefix_doc_to_text + base_doc_to_text + if args.cot: + doc_to_text = doc_to_text + " Let's think step by step.\n" + + yaml_dict = { + "include": "_template_yaml", + "task": f"bbh_{args.task_prefix}_{task}", + "dataset_name": task, + "description": description+"\n\n", + "doc_to_text": doc_to_text, + } + + file_save_path = args.save_prefix_path + f"/{task}.yaml" + eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}") + with open(file_save_path, "w") as yaml_file: + yaml.dump(yaml_dict, yaml_file, width=float("inf"), allow_unicode=True, default_style='"') -if __name__ == "__main__": - main() -# https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/boolean_expressions.txt diff --git a/lm_eval/tasks/bbh/_template_yaml b/lm_eval/tasks/bbh/_template_yaml deleted file mode 100644 index 3b174480..00000000 --- a/lm_eval/tasks/bbh/_template_yaml +++ /dev/null @@ -1,15 +0,0 @@ -group: bbh -dataset_path: lukaemon/bbh -output_type: greedy_until -test_split: test -doc_to_text: "Q: {{input}}\nA:" -doc_to_target: "{{target}}" -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true -generation_kwargs: - until: - - "" - do_sample: false - temperature: 0.0 diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml index d9dbf8c5..e435e57b 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml @@ -1,4 +1,4 @@ -group: bbh_flan_fewshot +group: bbh_flan_cot_fewshot dataset_path: lukaemon/bbh output_type: greedy_until test_split: test diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml index 0f6fc880..f660be72 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml @@ -1,4 +1,4 @@ -group: bbh_flan_zeroshot +group: bbh_flan_cot_zeroshot dataset_path: lukaemon/bbh output_type: greedy_until test_split: test -- GitLab From 03be40e28b3ab9c0ea14ae454b883c08f5957ca0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 11:26:07 +0000 Subject: [PATCH 033/212] 
update

---
 lm_eval/tasks/bbh/README.md | 4 +++
 .../_flan_cot_fewshot_template_yaml | 4 +--
 .../_flan_cot_zeroshot_template_yaml | 4 +--
 .../flan_fewshot/_flan_fewshot_template_yaml | 4 +--
 .../_flan_zeroshot_template_yaml | 4 +--
 lm_eval/tasks/mmlu/_generate_configs.py | 3 +--
 .../hendrycks_test_original_default.yaml | 21 ---------
 .../_mmlu_flan_cot_fewshot_template_yaml | 27 +++++++++----------
 .../_mmlu_flan_generative_template_yaml | 25 +++++++++--------
 .../_mmlu_flan_generative_template_yaml | 16 ++++-------
 .../_mmlu_flan_loglikelihood_template_yaml | 5 ++--
 .../flan_n_shot/mmlu_abstract_algebra.yaml | 7 -----
 .../tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml | 7 -----
 .../mmlu/flan_n_shot/mmlu_astronomy.yaml | 7 -----
 .../flan_n_shot/mmlu_business_ethics.yaml | 7 -----
 .../flan_n_shot/mmlu_clinical_knowledge.yaml | 7 -----
 .../flan_n_shot/mmlu_college_biology.yaml | 7 -----
 .../flan_n_shot/mmlu_college_chemistry.yaml | 7 -----
 .../mmlu_college_computer_science.yaml | 7 -----
 .../flan_n_shot/mmlu_college_mathematics.yaml | 7 -----
 .../flan_n_shot/mmlu_college_medicine.yaml | 7 -----
 .../flan_n_shot/mmlu_college_physics.yaml | 7 -----
 .../flan_n_shot/mmlu_computer_security.yaml | 7 -----
 .../flan_n_shot/mmlu_conceptual_physics.yaml | 7 -----
 .../mmlu/flan_n_shot/mmlu_econometrics.yaml | 7 -----
 .../mmlu_electrical_engineering.yaml | 7 -----
 .../mmlu_elementary_mathematics.yaml | 7 -----
 .../mmlu/flan_n_shot/mmlu_formal_logic.yaml | 7 -----
 .../mmlu/flan_n_shot/mmlu_global_facts.yaml | 7 -----
 .../flan_n_shot/mmlu_high_school_biology.yaml | 7 -----
 .../mmlu_high_school_chemistry.yaml | 7 -----
 .../mmlu_high_school_computer_science.yaml | 7 -----
 .../mmlu_high_school_european_history.yaml | 7 -----
 .../mmlu_high_school_geography.yaml | 7 -----
 ...u_high_school_government_and_politics.yaml | 7 -----
 .../mmlu_high_school_macroeconomics.yaml | 7 -----
 .../mmlu_high_school_mathematics.yaml | 7 -----
 .../mmlu_high_school_microeconomics.yaml | 7 -----
 .../flan_n_shot/mmlu_high_school_physics.yaml | 7 -----
 .../mmlu_high_school_psychology.yaml | 7 -----
 .../mmlu_high_school_statistics.yaml | 7 -----
 .../mmlu_high_school_us_history.yaml | 7 -----
 .../mmlu_high_school_world_history.yaml | 7 -----
 .../mmlu/flan_n_shot/mmlu_human_aging.yaml | 7 -----
 .../flan_n_shot/mmlu_human_sexuality.yaml | 7 -----
 .../flan_n_shot/mmlu_international_law.yaml | 7 -----
 .../mmlu/flan_n_shot/mmlu_jurisprudence.yaml | 7 -----
 .../flan_n_shot/mmlu_logical_fallacies.yaml | 7 -----
 .../mmlu_loglikelihood_abstract_algebra.yaml | 7 -----
 .../mmlu_loglikelihood_anatomy.yaml | 7 -----
 .../mmlu_loglikelihood_astronomy.yaml | 7 -----
 .../mmlu_loglikelihood_business_ethics.yaml | 7 -----
 ...mmlu_loglikelihood_clinical_knowledge.yaml | 7 -----
 .../mmlu_loglikelihood_college_biology.yaml | 7 -----
 .../mmlu_loglikelihood_college_chemistry.yaml | 7 -----
 ...oglikelihood_college_computer_science.yaml | 7 -----
 ...mlu_loglikelihood_college_mathematics.yaml | 7 -----
 .../mmlu_loglikelihood_college_medicine.yaml | 7 -----
 .../mmlu_loglikelihood_college_physics.yaml | 7 -----
 .../mmlu_loglikelihood_computer_security.yaml | 7 -----
 ...mmlu_loglikelihood_conceptual_physics.yaml | 7 -----
 .../mmlu_loglikelihood_econometrics.yaml | 7 -----
 ..._loglikelihood_electrical_engineering.yaml | 7 -----
 ..._loglikelihood_elementary_mathematics.yaml | 7 -----
 .../mmlu_loglikelihood_formal_logic.yaml | 7 -----
 .../mmlu_loglikelihood_global_facts.yaml | 7 -----
 ...mlu_loglikelihood_high_school_biology.yaml | 7 -----
...u_loglikelihood_high_school_chemistry.yaml | 7 ----- ...kelihood_high_school_computer_science.yaml | 7 ----- ...kelihood_high_school_european_history.yaml | 7 ----- ...u_loglikelihood_high_school_geography.yaml | 7 ----- ...d_high_school_government_and_politics.yaml | 7 ----- ...likelihood_high_school_macroeconomics.yaml | 7 ----- ...loglikelihood_high_school_mathematics.yaml | 7 ----- ...likelihood_high_school_microeconomics.yaml | 7 ----- ...mlu_loglikelihood_high_school_physics.yaml | 7 ----- ..._loglikelihood_high_school_psychology.yaml | 7 ----- ..._loglikelihood_high_school_statistics.yaml | 7 ----- ..._loglikelihood_high_school_us_history.yaml | 7 ----- ...glikelihood_high_school_world_history.yaml | 7 ----- .../mmlu_loglikelihood_human_aging.yaml | 7 ----- .../mmlu_loglikelihood_human_sexuality.yaml | 7 ----- .../mmlu_loglikelihood_international_law.yaml | 7 ----- .../mmlu_loglikelihood_jurisprudence.yaml | 7 ----- .../mmlu_loglikelihood_logical_fallacies.yaml | 7 ----- .../mmlu_loglikelihood_machine_learning.yaml | 7 ----- .../mmlu_loglikelihood_management.yaml | 7 ----- .../mmlu_loglikelihood_marketing.yaml | 7 ----- .../mmlu_loglikelihood_medical_genetics.yaml | 7 ----- .../mmlu_loglikelihood_miscellaneous.yaml | 7 ----- .../mmlu_loglikelihood_moral_disputes.yaml | 7 ----- .../mmlu_loglikelihood_moral_scenarios.yaml | 7 ----- .../mmlu_loglikelihood_nutrition.yaml | 7 ----- .../mmlu_loglikelihood_philosophy.yaml | 7 ----- .../mmlu_loglikelihood_prehistory.yaml | 7 ----- ...loglikelihood_professional_accounting.yaml | 7 ----- .../mmlu_loglikelihood_professional_law.yaml | 7 ----- ...u_loglikelihood_professional_medicine.yaml | 7 ----- ...loglikelihood_professional_psychology.yaml | 7 ----- .../mmlu_loglikelihood_public_relations.yaml | 7 ----- .../mmlu_loglikelihood_security_studies.yaml | 7 ----- .../mmlu_loglikelihood_sociology.yaml | 7 ----- .../mmlu_loglikelihood_us_foreign_policy.yaml | 7 ----- .../mmlu_loglikelihood_virology.yaml | 7 ----- .../mmlu_loglikelihood_world_religions.yaml | 7 ----- .../flan_n_shot/mmlu_machine_learning.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_management.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_marketing.yaml | 7 ----- .../flan_n_shot/mmlu_medical_genetics.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_miscellaneous.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_moral_disputes.yaml | 7 ----- .../flan_n_shot/mmlu_moral_scenarios.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_nutrition.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_philosophy.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_prehistory.yaml | 7 ----- .../mmlu_professional_accounting.yaml | 7 ----- .../flan_n_shot/mmlu_professional_law.yaml | 7 ----- .../mmlu_professional_medicine.yaml | 7 ----- .../mmlu_professional_psychology.yaml | 7 ----- .../flan_n_shot/mmlu_public_relations.yaml | 7 ----- .../flan_n_shot/mmlu_security_studies.yaml | 7 ----- .../mmlu/flan_n_shot/mmlu_sociology.yaml | 7 ----- .../flan_n_shot/mmlu_us_foreign_policy.yaml | 7 ----- .../tasks/mmlu/flan_n_shot/mmlu_virology.yaml | 7 ----- .../flan_n_shot/mmlu_world_religions.yaml | 7 ----- 125 files changed, 45 insertions(+), 870 deletions(-) delete mode 100644 lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml delete mode 100644 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml 
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml delete mode 100644 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml diff --git a/lm_eval/tasks/bbh/README.md b/lm_eval/tasks/bbh/README.md index 4f8dad49..eb3090bd 100644 --- a/lm_eval/tasks/bbh/README.md +++ b/lm_eval/tasks/bbh/README.md @@ -26,6 +26,10 @@ Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard #### Groups - `bbh_flan_zeroshot` +- `bbh_flan_fewshot` +- `bbh_flan_cot_fewshot` +- `bbh_flan_cot_zeroshot` + #### Tasks diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml index e435e57b..34d7f066 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml @@ -7,8 +7,8 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: true + # ignore_case: true + # ignore_punctuation: true generation_kwargs: until: - "" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml index f660be72..bda6eb96 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml 
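The hunk that follows (like the fewshot one just above) comments out exact_match's `ignore_case` and `ignore_punctuation` options, making the comparison strict. Roughly, those flags normalize both strings before comparing, along these lines (a sketch of the usual semantics, not the harness's actual metric implementation):

import string

def exact_match(pred: str, gold: str, ignore_case=False, ignore_punctuation=False) -> bool:
    # Optionally fold case and strip punctuation before the strict comparison.
    if ignore_case:
        pred, gold = pred.lower(), gold.lower()
    if ignore_punctuation:
        table = str.maketrans("", "", string.punctuation)
        pred, gold = pred.translate(table), gold.translate(table)
    return pred == gold

# Strict matching distinguishes "(A)" from "a"; with both flags on they agree.
print(exact_match("(A)", "a"))                                             # False
print(exact_match("(A)", "a", ignore_case=True, ignore_punctuation=True))  # True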
@@ -7,8 +7,8 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: true + # ignore_case: true + # ignore_punctuation: true generation_kwargs: until: - "" diff --git a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml index ff53aabc..89e5de29 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml @@ -7,8 +7,8 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: true + # ignore_case: true + # ignore_punctuation: true generation_kwargs: until: - "" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml index 832c728d..66dbf369 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml @@ -7,8 +7,8 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: true + # ignore_case: true + # ignore_punctuation: true generation_kwargs: until: - "" diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index af9bd0c6..36a3936c 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -92,7 +92,6 @@ if __name__ == "__main__": base_yaml_name = os.path.split(args.base_yaml_path)[-1] with open(args.base_yaml_path) as f: base_yaml = yaml.full_load(f) - print(base_yaml) if args.cot_prompt_path is not None: import json @@ -115,4 +114,4 @@ if __name__ == "__main__": file_save_path = args.save_prefix_path + f"_{subject}.yaml" eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") with open(file_save_path, "w") as yaml_file: - yaml.dump(yaml_dict, yaml_file, width=float("inf")) + yaml.dump(yaml_dict, yaml_file, width=float("inf"), allow_unicode=True, default_style='"') diff --git a/lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml b/lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml deleted file mode 100644 index 248e7561..00000000 --- a/lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml +++ /dev/null @@ -1,21 +0,0 @@ -group: - - mmlu - - mmlu_original - - multiple_choice -task: mmlu_original_abstract_algebra -dataset_path: cais/mmlu -dataset_name: abstract_algebra -output_type: multiple_choice -validation_split: validation -test_split: test -description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n" -doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:" -doc_to_choice: ["A", "B", "C", "D"] -doc_to_target: "{{answer}}" -metric_list: - - metric: acc - aggregation: mean - higher_is_better: true - - metric: acc_norm - aggregation: mean - higher_is_better: true diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index 64587d54..ffa9ee87 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -2,24 +2,23 @@ group: mmlu_flan_cot_fewshot dataset_path: cais/mmlu validation_split: validation fewshot_split: dev -doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." -fewshot_delimiter: "" output_type: greedy_until +doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "(?<=The answer is )(.*)(?=.)" + - function: "take_first" generation_kwargs: until: - "" do_sample: false temperature: 0.0 -filter_list: - - name: "get-answer" - filter: - - function: "regex" - regex_pattern: "(?<=The answer is )(.*)(.)" - - function: "take_first" \ No newline at end of file +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true \ No newline at end of file diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml index e5b8e429..c9b03734 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml @@ -2,24 +2,23 @@ group: mmlu_flan_cot_zeroshot dataset_path: cais/mmlu validation_split: validation fewshot_split: dev -doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." output_type: greedy_until -fewshot_delimiter: "" +doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." 
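The `get-answer` filter above pulls the model's final choice out of its chain-of-thought with a regex, and `take_first` keeps the first match. Note that the trailing `(?=.)` lookahead requires one more character after the capture: it trims a final period, but it also clips the last character of an answer that ends the generation. A quick, self-contained check of the pattern:

import re

pattern = re.compile(r"(?<=The answer is )(.*)(?=.)")

print(pattern.search("Adding the digits gives 7. The answer is (B).").group(1))  # (B)
# Without trailing punctuation, the lookahead eats the closing parenthesis:
print(pattern.search("The answer is (B)").group(1))  # (B   <- last char clipped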
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "(?<=The answer is )(.*)(?=.)" + - function: "take_first" +generation_kwargs: + until: + - "" + do_sample: false + temperature: 0.0 metric_list: - metric: exact_match aggregation: mean higher_is_better: true ignore_case: true ignore_punctuation: true -generation_kwargs: - until: - - "" - do_sample: false - temperature: 0.0 -filter_list: - - name: "get-answer" - filter: - - function: "regex" - regex_pattern: "(?<=The answer is )(.*)(.)" - - function: "take_first" \ No newline at end of file diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml index c52f8dc7..b1ff96a8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml @@ -2,19 +2,13 @@ group: mmlu_flan_n_shot_generative dataset_path: cais/mmlu test_split: test fewshot_split: dev -# doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: " -doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" output_type: greedy_until -# doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" -doc_to_target: "{{['A', 'B', 'C', 'D'][answer]}}" +doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: " +doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" +generation_kwargs: + until: + - "" metric_list: - metric: exact_match aggregation: mean higher_is_better: true - # ignore_case: true - # ignore_punctuation: true -generation_kwargs: - until: - - "" -# do_sample: false -# temperature: 0.0 \ No newline at end of file diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml index 2162ade8..2d5d92ef 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml @@ -1,11 +1,10 @@ group: mmlu_flan_n_shot_loglikelihood dataset_path: cais/mmlu -# validation_split: validation test_split: test fewshot_split: dev output_type: multiple_choice -doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" -doc_to_choice: ["A", "B", "C", "D"] +doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: " +doc_to_choice: ["(A)", "(B)", "(C)", "(D)"] doc_to_target: answer metric_list: - metric: acc diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml deleted file mode 100644 index 31729f37..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: abstract_algebra -description: 'The following are multiple choice questions (with answers) about abstract algebra. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_abstract_algebra diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml deleted file mode 100644 index e8978402..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: anatomy -description: 'The following are multiple choice questions (with answers) about anatomy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_anatomy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml deleted file mode 100644 index 66902758..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: astronomy -description: 'The following are multiple choice questions (with answers) about astronomy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_astronomy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml deleted file mode 100644 index d1dcf3c7..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: business_ethics -description: 'The following are multiple choice questions (with answers) about business ethics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_business_ethics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml deleted file mode 100644 index 14b12359..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: clinical_knowledge -description: 'The following are multiple choice questions (with answers) about clinical knowledge. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_clinical_knowledge diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml deleted file mode 100644 index 0d202b8e..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_biology -description: 'The following are multiple choice questions (with answers) about college biology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_college_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml deleted file mode 100644 index 77f6328f..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_chemistry -description: 'The following are multiple choice questions (with answers) about college chemistry. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_college_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml deleted file mode 100644 index f5cbda28..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college computer science. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_college_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml deleted file mode 100644 index dbc9be4c..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_mathematics -description: 'The following are multiple choice questions (with answers) about college mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_college_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml deleted file mode 100644 index efc868f0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_medicine.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_medicine -description: 'The following are multiple choice questions (with answers) about college medicine. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_college_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml deleted file mode 100644 index d92c14ea..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_physics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college physics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_college_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml deleted file mode 100644 index 3ddf3ee5..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_computer_security.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: computer_security -description: 'The following are multiple choice questions (with answers) about computer security. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_computer_security diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml deleted file mode 100644 index 7c4f90ed..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_conceptual_physics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: conceptual_physics -description: 'The following are multiple choice questions (with answers) about conceptual physics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_conceptual_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml deleted file mode 100644 index b46c90cb..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_econometrics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: econometrics -description: 'The following are multiple choice questions (with answers) about econometrics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_econometrics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml deleted file mode 100644 index 0308fe16..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_electrical_engineering.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: electrical_engineering -description: 'The following are multiple choice questions (with answers) about electrical engineering. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_electrical_engineering diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml deleted file mode 100644 index 2b8a8caf..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_elementary_mathematics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: elementary_mathematics -description: 'The following are multiple choice questions (with answers) about elementary mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_elementary_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml deleted file mode 100644 index 10f58f41..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_formal_logic.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: formal_logic -description: 'The following are multiple choice questions (with answers) about formal logic. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_formal_logic diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml deleted file mode 100644 index 48816fe3..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_global_facts.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: global_facts -description: 'The following are multiple choice questions (with answers) about global facts. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_global_facts diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml deleted file mode 100644 index ebb1ded2..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_biology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_biology -description: 'The following are multiple choice questions (with answers) about high school biology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml deleted file mode 100644 index 66a484a3..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_chemistry.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_chemistry -description: 'The following are multiple choice questions (with answers) about high school chemistry. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml deleted file mode 100644 index b9a9060c..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_computer_science.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_computer_science -description: 'The following are multiple choice questions (with answers) about high school computer science. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml deleted file mode 100644 index f89cca29..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_european_history.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_european_history -description: 'The following are multiple choice questions (with answers) about high school european history. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_european_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml deleted file mode 100644 index f255d37a..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_geography.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_geography -description: 'The following are multiple choice questions (with answers) about high school geography. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_geography diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml deleted file mode 100644 index 108aebf8..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_government_and_politics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_government_and_politics -description: 'The following are multiple choice questions (with answers) about high school government and politics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_government_and_politics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml deleted file mode 100644 index 720baeac..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_macroeconomics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high school macroeconomics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_macroeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml deleted file mode 100644 index fbad67d6..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_mathematics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_mathematics -description: 'The following are multiple choice questions (with answers) about high school mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml deleted file mode 100644 index 4b4e85bd..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_microeconomics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_microeconomics -description: 'The following are multiple choice questions (with answers) about high school microeconomics. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_microeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml deleted file mode 100644 index 941d6c22..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_physics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_physics -description: 'The following are multiple choice questions (with answers) about high school physics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml deleted file mode 100644 index 831907f0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_psychology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high school psychology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml deleted file mode 100644 index 255c7394..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_statistics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high school statistics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_statistics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml deleted file mode 100644 index 4ea76cd3..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_us_history.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_us_history -description: 'The following are multiple choice questions (with answers) about high school us history. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_us_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml deleted file mode 100644 index 26551e82..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_high_school_world_history.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_world_history -description: 'The following are multiple choice questions (with answers) about high school world history. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_high_school_world_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml deleted file mode 100644 index 042e81cf..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_aging.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human aging. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_human_aging diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml deleted file mode 100644 index d2d55b70..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_human_sexuality.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: human_sexuality -description: 'The following are multiple choice questions (with answers) about human sexuality. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_human_sexuality diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml deleted file mode 100644 index 12b18807..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_international_law.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international law. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_international_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml deleted file mode 100644 index 51613f16..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_jurisprudence.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: jurisprudence -description: 'The following are multiple choice questions (with answers) about jurisprudence. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_jurisprudence diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml deleted file mode 100644 index aaaef665..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_logical_fallacies.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical fallacies. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_logical_fallacies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml deleted file mode 100644 index 19f6d3ee..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_abstract_algebra.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: abstract_algebra -description: 'The following are multiple choice questions (with answers) about abstract algebra. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_abstract_algebra diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml deleted file mode 100644 index ff927e05..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_anatomy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: anatomy -description: 'The following are multiple choice questions (with answers) about anatomy. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_anatomy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml deleted file mode 100644 index 95329c44..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_astronomy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: astronomy -description: 'The following are multiple choice questions (with answers) about astronomy. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_astronomy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml deleted file mode 100644 index b6917938..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_business_ethics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: business_ethics -description: 'The following are multiple choice questions (with answers) about business ethics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_business_ethics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml deleted file mode 100644 index bc19e7c5..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_clinical_knowledge.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: clinical_knowledge -description: 'The following are multiple choice questions (with answers) about clinical knowledge. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_clinical_knowledge diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml deleted file mode 100644 index defc3d98..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_biology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_biology -description: 'The following are multiple choice questions (with answers) about college biology. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_college_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml deleted file mode 100644 index 15a2b3cc..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_chemistry.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_chemistry -description: 'The following are multiple choice questions (with answers) about college chemistry. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_college_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml deleted file mode 100644 index ff69b70e..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_computer_science.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college computer science. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_college_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml deleted file mode 100644 index fb67c2ee..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_mathematics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_mathematics -description: 'The following are multiple choice questions (with answers) about college mathematics. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_college_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml deleted file mode 100644 index 6edac775..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_medicine.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_medicine -description: 'The following are multiple choice questions (with answers) about college medicine. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_college_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml deleted file mode 100644 index 2af2929f..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_college_physics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college physics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_college_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml deleted file mode 100644 index fe239463..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_computer_security.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: computer_security -description: 'The following are multiple choice questions (with answers) about computer security. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_computer_security diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml deleted file mode 100644 index a593cb97..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_conceptual_physics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: conceptual_physics -description: 'The following are multiple choice questions (with answers) about conceptual physics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_conceptual_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml deleted file mode 100644 index 034c0e63..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_econometrics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: econometrics -description: 'The following are multiple choice questions (with answers) about econometrics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_econometrics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml deleted file mode 100644 index 06e8e3b0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_electrical_engineering.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: electrical_engineering -description: 'The following are multiple choice questions (with answers) about electrical engineering. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_electrical_engineering diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml deleted file mode 100644 index ea151100..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_elementary_mathematics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: elementary_mathematics -description: 'The following are multiple choice questions (with answers) about elementary mathematics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_elementary_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml deleted file mode 100644 index 2a2299fb..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_formal_logic.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: formal_logic -description: 'The following are multiple choice questions (with answers) about formal logic. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_formal_logic diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml deleted file mode 100644 index b3c003f3..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_global_facts.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: global_facts -description: 'The following are multiple choice questions (with answers) about global facts. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_global_facts diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml deleted file mode 100644 index 6a28adbd..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_biology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_biology -description: 'The following are multiple choice questions (with answers) about high school biology. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_biology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml deleted file mode 100644 index dedab0ad..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_chemistry.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_chemistry -description: 'The following are multiple choice questions (with answers) about high school chemistry. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_chemistry diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml deleted file mode 100644 index 8c096fd8..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_computer_science.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_computer_science -description: 'The following are multiple choice questions (with answers) about high school computer science. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_computer_science diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml deleted file mode 100644 index 8b2a2705..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_european_history.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_european_history -description: 'The following are multiple choice questions (with answers) about high school european history. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_european_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml deleted file mode 100644 index 32bcc3e1..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_geography.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_geography -description: 'The following are multiple choice questions (with answers) about high school geography. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_geography diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml deleted file mode 100644 index 191bc63b..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_government_and_politics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_government_and_politics -description: 'The following are multiple choice questions (with answers) about high school government and politics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml deleted file mode 100644 index 838ffed9..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_macroeconomics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high school macroeconomics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml deleted file mode 100644 index 246d8988..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_mathematics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_mathematics -description: 'The following are multiple choice questions (with answers) about high school mathematics. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_mathematics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml deleted file mode 100644 index 1fea1850..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_microeconomics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_microeconomics -description: 'The following are multiple choice questions (with answers) about high school microeconomics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_microeconomics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml deleted file mode 100644 index 6aa802e8..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_physics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_physics -description: 'The following are multiple choice questions (with answers) about high school physics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_physics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml deleted file mode 100644 index 521b3e54..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_psychology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high school psychology. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml deleted file mode 100644 index 3cd82472..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_statistics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high school statistics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_statistics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml deleted file mode 100644 index 34a7d05d..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_us_history.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_us_history -description: 'The following are multiple choice questions (with answers) about high school us history. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_us_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml deleted file mode 100644 index b6390aa3..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_high_school_world_history.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: high_school_world_history -description: 'The following are multiple choice questions (with answers) about high school world history. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_high_school_world_history diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml deleted file mode 100644 index bf454427..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_aging.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human aging. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_human_aging diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml deleted file mode 100644 index 7bec1aa9..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_human_sexuality.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: human_sexuality -description: 'The following are multiple choice questions (with answers) about human sexuality. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_human_sexuality diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml deleted file mode 100644 index 6d56237f..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_international_law.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international law. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_international_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml deleted file mode 100644 index a95f42ed..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_jurisprudence.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: jurisprudence -description: 'The following are multiple choice questions (with answers) about jurisprudence. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_jurisprudence diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml deleted file mode 100644 index bc2b1b41..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_logical_fallacies.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical fallacies. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_logical_fallacies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml deleted file mode 100644 index 3688fd28..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_machine_learning.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: machine_learning -description: 'The following are multiple choice questions (with answers) about machine learning. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_machine_learning diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml deleted file mode 100644 index 70eb8768..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_management.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: management -description: 'The following are multiple choice questions (with answers) about management. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_management diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml deleted file mode 100644 index 48c03524..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_marketing.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: marketing -description: 'The following are multiple choice questions (with answers) about marketing. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_marketing diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml deleted file mode 100644 index 68bd9c12..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_medical_genetics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical genetics. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_medical_genetics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml deleted file mode 100644 index bde2352b..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_miscellaneous.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: miscellaneous -description: 'The following are multiple choice questions (with answers) about miscellaneous. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_miscellaneous diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml deleted file mode 100644 index c55d44bd..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_disputes.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: moral_disputes -description: 'The following are multiple choice questions (with answers) about moral disputes. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_moral_disputes diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml deleted file mode 100644 index 99bcb848..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_moral_scenarios.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: moral_scenarios -description: 'The following are multiple choice questions (with answers) about moral scenarios. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_moral_scenarios diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml deleted file mode 100644 index 34c0040a..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_nutrition.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: nutrition -description: 'The following are multiple choice questions (with answers) about nutrition. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_nutrition diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml deleted file mode 100644 index 83588531..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_philosophy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: philosophy -description: 'The following are multiple choice questions (with answers) about philosophy. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_philosophy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml deleted file mode 100644 index a94b514b..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_prehistory.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: prehistory -description: 'The following are multiple choice questions (with answers) about prehistory. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_prehistory diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml deleted file mode 100644 index 2f64beae..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_accounting.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_accounting -description: 'The following are multiple choice questions (with answers) about professional accounting. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_professional_accounting diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml deleted file mode 100644 index de0e8392..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_law.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_law -description: 'The following are multiple choice questions (with answers) about professional law. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_professional_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml deleted file mode 100644 index ef4d0c07..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_medicine.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_medicine -description: 'The following are multiple choice questions (with answers) about professional medicine. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_professional_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml deleted file mode 100644 index 9bb12be0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_professional_psychology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_psychology -description: 'The following are multiple choice questions (with answers) about professional psychology. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_professional_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml deleted file mode 100644 index e9a761cc..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_public_relations.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public relations. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_public_relations diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml deleted file mode 100644 index 6a141ba6..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_security_studies.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: security_studies -description: 'The following are multiple choice questions (with answers) about security studies. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_security_studies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml deleted file mode 100644 index 11069f9e..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_sociology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: sociology -description: 'The following are multiple choice questions (with answers) about sociology. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_sociology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml deleted file mode 100644 index 7aa77456..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_us_foreign_policy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us foreign policy. 
- - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_us_foreign_policy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml deleted file mode 100644 index 9a9b94a1..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_virology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: virology -description: 'The following are multiple choice questions (with answers) about virology. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_virology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml deleted file mode 100644 index 6f26fda1..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_loglikelihood_world_religions.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: world_religions -description: 'The following are multiple choice questions (with answers) about world religions. - - - ' -include: _mmlu_flan_loglikelihood_template_yaml -task: mmlu_flan_n_shot_loglikelihood_world_religions diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml deleted file mode 100644 index c97c9f09..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_machine_learning.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: machine_learning -description: 'The following are multiple choice questions (with answers) about machine learning. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_machine_learning diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml deleted file mode 100644 index 9c0c65b0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_management.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: management -description: 'The following are multiple choice questions (with answers) about management. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_management diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml deleted file mode 100644 index e2a74ca0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_marketing.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: marketing -description: 'The following are multiple choice questions (with answers) about marketing. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_marketing diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml deleted file mode 100644 index 0464b15c..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_medical_genetics.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical genetics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_medical_genetics diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml deleted file mode 100644 index 389ca552..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_miscellaneous.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: miscellaneous -description: 'The following are multiple choice questions (with answers) about miscellaneous. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_miscellaneous diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml deleted file mode 100644 index 671ca84e..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_disputes.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: moral_disputes -description: 'The following are multiple choice questions (with answers) about moral disputes. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_moral_disputes diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml deleted file mode 100644 index 1ecbff40..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_moral_scenarios.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: moral_scenarios -description: 'The following are multiple choice questions (with answers) about moral scenarios. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_moral_scenarios diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml deleted file mode 100644 index 6d2da5cb..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_nutrition.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: nutrition -description: 'The following are multiple choice questions (with answers) about nutrition. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_nutrition diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml deleted file mode 100644 index 421c50f9..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_philosophy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: philosophy -description: 'The following are multiple choice questions (with answers) about philosophy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_philosophy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml deleted file mode 100644 index 6e534911..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_prehistory.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: prehistory -description: 'The following are multiple choice questions (with answers) about prehistory. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_prehistory diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml deleted file mode 100644 index 93b4e4d3..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_accounting.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_accounting -description: 'The following are multiple choice questions (with answers) about professional accounting. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_professional_accounting diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml deleted file mode 100644 index a8704652..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_law.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_law -description: 'The following are multiple choice questions (with answers) about professional law. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_professional_law diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml deleted file mode 100644 index 137a39d5..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_medicine.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_medicine -description: 'The following are multiple choice questions (with answers) about professional medicine. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_professional_medicine diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml deleted file mode 100644 index 342031f7..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_professional_psychology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: professional_psychology -description: 'The following are multiple choice questions (with answers) about professional psychology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_professional_psychology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml deleted file mode 100644 index 88ffe1b8..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_public_relations.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public relations. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_public_relations diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml deleted file mode 100644 index b56c6803..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_security_studies.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: security_studies -description: 'The following are multiple choice questions (with answers) about security studies. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_security_studies diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml deleted file mode 100644 index bca11a0a..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_sociology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: sociology -description: 'The following are multiple choice questions (with answers) about sociology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_sociology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml deleted file mode 100644 index 797fc9b7..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_us_foreign_policy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us foreign policy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_us_foreign_policy diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml deleted file mode 100644 index 6f6d1680..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_virology.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: virology -description: 'The following are multiple choice questions (with answers) about virology. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_virology diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml deleted file mode 100644 index b0253c46..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_world_religions.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dataset_name: world_religions -description: 'The following are multiple choice questions (with answers) about world religions. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_n_shot_world_religions -- GitLab From 4a7528679fcbc71e5ce51c2686908ed728e1a17a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 11:32:49 +0000 Subject: [PATCH 034/212] update flan mmlu --- lm_eval/tasks/mmlu/_generate_configs.py | 2 +- lm_eval/tasks/mmlu/default/_default_template_yaml | 15 +++++++++++++++ .../tasks/mmlu/default/mmlu_abstract_algebra.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_business_ethics.yaml | 4 ++++ .../mmlu/default/mmlu_clinical_knowledge.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_college_biology.yaml | 4 ++++ .../mmlu/default/mmlu_college_chemistry.yaml | 4 ++++ .../default/mmlu_college_computer_science.yaml | 4 ++++ .../mmlu/default/mmlu_college_mathematics.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_college_medicine.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_college_physics.yaml | 4 ++++ .../mmlu/default/mmlu_computer_security.yaml | 4 ++++ .../mmlu/default/mmlu_conceptual_physics.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml | 4 ++++ .../mmlu/default/mmlu_electrical_engineering.yaml | 4 ++++ .../mmlu/default/mmlu_elementary_mathematics.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml | 4 ++++ .../mmlu/default/mmlu_high_school_biology.yaml | 4 ++++ .../mmlu/default/mmlu_high_school_chemistry.yaml | 4 ++++ .../mmlu_high_school_computer_science.yaml | 4 ++++ .../mmlu_high_school_european_history.yaml | 4 ++++ .../mmlu/default/mmlu_high_school_geography.yaml | 4 ++++ .../mmlu_high_school_government_and_politics.yaml | 4 ++++ .../default/mmlu_high_school_macroeconomics.yaml | 4 ++++ .../default/mmlu_high_school_mathematics.yaml | 4 ++++ .../default/mmlu_high_school_microeconomics.yaml | 4 ++++ .../mmlu/default/mmlu_high_school_physics.yaml | 4 ++++ .../mmlu/default/mmlu_high_school_psychology.yaml | 4 ++++ .../mmlu/default/mmlu_high_school_statistics.yaml | 4 ++++ .../mmlu/default/mmlu_high_school_us_history.yaml | 4 ++++ .../default/mmlu_high_school_world_history.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_human_sexuality.yaml | 4 ++++ .../mmlu/default/mmlu_international_law.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_jurisprudence.yaml | 4 ++++ .../mmlu/default/mmlu_logical_fallacies.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_machine_learning.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_management.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_marketing.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_medical_genetics.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_miscellaneous.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_moral_disputes.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_moral_scenarios.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml | 4 
++++ .../default/mmlu_professional_accounting.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_professional_law.yaml | 4 ++++ .../mmlu/default/mmlu_professional_medicine.yaml | 4 ++++ .../default/mmlu_professional_psychology.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_public_relations.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_security_studies.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_sociology.yaml | 4 ++++ .../mmlu/default/mmlu_us_foreign_policy.yaml | 4 ++++ lm_eval/tasks/mmlu/default/mmlu_virology.yaml | 4 ++++ .../tasks/mmlu/default/mmlu_world_religions.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_abstract_algebra.yaml | 4 ++++ .../tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_astronomy.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_business_ethics.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_clinical_knowledge.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_college_biology.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_college_chemistry.yaml | 4 ++++ .../mmlu_gen_college_computer_science.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_college_mathematics.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_college_medicine.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_college_physics.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_computer_security.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_conceptual_physics.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_econometrics.yaml | 4 ++++ .../mmlu_gen_electrical_engineering.yaml | 4 ++++ .../mmlu_gen_elementary_mathematics.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_global_facts.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_high_school_biology.yaml | 4 ++++ .../mmlu_gen_high_school_chemistry.yaml | 4 ++++ .../mmlu_gen_high_school_computer_science.yaml | 4 ++++ .../mmlu_gen_high_school_european_history.yaml | 4 ++++ .../mmlu_gen_high_school_geography.yaml | 4 ++++ ...u_gen_high_school_government_and_politics.yaml | 4 ++++ .../mmlu_gen_high_school_macroeconomics.yaml | 4 ++++ .../mmlu_gen_high_school_mathematics.yaml | 4 ++++ .../mmlu_gen_high_school_microeconomics.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_high_school_physics.yaml | 4 ++++ .../mmlu_gen_high_school_psychology.yaml | 4 ++++ .../mmlu_gen_high_school_statistics.yaml | 4 ++++ .../mmlu_gen_high_school_us_history.yaml | 4 ++++ .../mmlu_gen_high_school_world_history.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_human_aging.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_human_sexuality.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_international_law.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_logical_fallacies.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_machine_learning.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_management.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_marketing.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_medical_genetics.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_moral_scenarios.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_nutrition.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_philosophy.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_prehistory.yaml | 4 ++++ .../mmlu_gen_professional_accounting.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_professional_law.yaml | 4 ++++ .../mmlu_gen_professional_medicine.yaml | 4 ++++ .../mmlu_gen_professional_psychology.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_public_relations.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_security_studies.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_gen_sociology.yaml | 4 ++++ 
.../flan_n_shot/mmlu_gen_us_foreign_policy.yaml | 4 ++++ .../tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml | 4 ++++ .../flan_n_shot/mmlu_gen_world_religions.yaml | 4 ++++ .../flan_n_shot/mmlu_log_abstract_algebra.yaml | 4 ++++ .../tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_astronomy.yaml | 4 ++++ .../flan_n_shot/mmlu_log_business_ethics.yaml | 4 ++++ .../flan_n_shot/mmlu_log_clinical_knowledge.yaml | 4 ++++ .../flan_n_shot/mmlu_log_college_biology.yaml | 4 ++++ .../flan_n_shot/mmlu_log_college_chemistry.yaml | 4 ++++ .../mmlu_log_college_computer_science.yaml | 4 ++++ .../flan_n_shot/mmlu_log_college_mathematics.yaml | 4 ++++ .../flan_n_shot/mmlu_log_college_medicine.yaml | 4 ++++ .../flan_n_shot/mmlu_log_college_physics.yaml | 4 ++++ .../flan_n_shot/mmlu_log_computer_security.yaml | 4 ++++ .../flan_n_shot/mmlu_log_conceptual_physics.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_econometrics.yaml | 4 ++++ .../mmlu_log_electrical_engineering.yaml | 4 ++++ .../mmlu_log_elementary_mathematics.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_formal_logic.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_global_facts.yaml | 4 ++++ .../flan_n_shot/mmlu_log_high_school_biology.yaml | 4 ++++ .../mmlu_log_high_school_chemistry.yaml | 4 ++++ .../mmlu_log_high_school_computer_science.yaml | 4 ++++ .../mmlu_log_high_school_european_history.yaml | 4 ++++ .../mmlu_log_high_school_geography.yaml | 4 ++++ ...u_log_high_school_government_and_politics.yaml | 4 ++++ .../mmlu_log_high_school_macroeconomics.yaml | 4 ++++ .../mmlu_log_high_school_mathematics.yaml | 4 ++++ .../mmlu_log_high_school_microeconomics.yaml | 4 ++++ .../flan_n_shot/mmlu_log_high_school_physics.yaml | 4 ++++ .../mmlu_log_high_school_psychology.yaml | 4 ++++ .../mmlu_log_high_school_statistics.yaml | 4 ++++ .../mmlu_log_high_school_us_history.yaml | 4 ++++ .../mmlu_log_high_school_world_history.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_human_aging.yaml | 4 ++++ .../flan_n_shot/mmlu_log_human_sexuality.yaml | 4 ++++ .../flan_n_shot/mmlu_log_international_law.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml | 4 ++++ .../flan_n_shot/mmlu_log_logical_fallacies.yaml | 4 ++++ .../flan_n_shot/mmlu_log_machine_learning.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_management.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_marketing.yaml | 4 ++++ .../flan_n_shot/mmlu_log_medical_genetics.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml | 4 ++++ .../flan_n_shot/mmlu_log_moral_scenarios.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_nutrition.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_philosophy.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_prehistory.yaml | 4 ++++ .../mmlu_log_professional_accounting.yaml | 4 ++++ .../flan_n_shot/mmlu_log_professional_law.yaml | 4 ++++ .../mmlu_log_professional_medicine.yaml | 4 ++++ .../mmlu_log_professional_psychology.yaml | 4 ++++ .../flan_n_shot/mmlu_log_public_relations.yaml | 4 ++++ .../flan_n_shot/mmlu_log_security_studies.yaml | 4 ++++ .../mmlu/flan_n_shot/mmlu_log_sociology.yaml | 4 ++++ .../flan_n_shot/mmlu_log_us_foreign_policy.yaml | 4 ++++ .../tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml | 4 ++++ .../flan_n_shot/mmlu_log_world_religions.yaml | 4 ++++ 173 files changed, 700 insertions(+), 1 deletion(-) create mode 100644 lm_eval/tasks/mmlu/default/_default_template_yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml create mode 100644 
lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_international_law.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_management.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_marketing.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml create mode 100644 
lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_sociology.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_virology.yaml create mode 100644 lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml create mode 100644 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml create mode 100644 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml

diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py
index 36a3936c..f0dbd6bd 100644
--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
@@ -106,7 +106,7 @@ if __name__ == "__main__":
 
         yaml_dict = {
             "include": base_yaml_name,
-            "task": f"mmlu_{args.task_prefix}_{subject}",
+            "task": f"mmlu_{args.task_prefix}_{subject}" if args.task_prefix != "" else f"mmlu_{subject}",
             "dataset_name": subject,
             "description": description,
         }
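The one-line change to _generate_configs.py above is what lets the same generator emit both the unprefixed mmlu_* tasks and the prefixed flan_n_shot variants: with an empty --task_prefix, the old f-string left a stray double underscore (e.g. mmlu__anatomy). A minimal standalone sketch of the new naming rule; the task_name helper and the anatomy example are illustrative, not part of the patch:

    # Sketch of the task-naming rule introduced above; not part of the patch.
    def task_name(subject: str, task_prefix: str = "") -> str:
        # A non-empty prefix is joined with underscores; an empty one is
        # dropped entirely, so no double underscore is produced.
        if task_prefix != "":
            return f"mmlu_{task_prefix}_{subject}"
        return f"mmlu_{subject}"

    assert task_name("anatomy") == "mmlu_anatomy"
    assert task_name("anatomy", "flan_n_shot_generative") == "mmlu_flan_n_shot_generative_anatomy"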
diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml
new file mode 100644
index 00000000..757f7b5e
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/_default_template_yaml
@@ -0,0 +1,15 @@
+group: mmlu
+dataset_path: cais/mmlu
+test_split: test
+fewshot_split: dev
+output_type: multiple_choice
+doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
\ No newline at end of file
diff --git a/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml
new file mode 100644
index 00000000..b6d595d3
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "abstract_algebra"
+"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
+"include": "_default_template_yaml"
+"task": "mmlu_abstract_algebra"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml
new file mode 100644
index 00000000..6459cb41
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "anatomy"
+"description": "The following are multiple choice questions (with answers) about anatomy.\n\n"
+"include": "_default_template_yaml"
+"task": "mmlu_anatomy"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml
new file mode 100644
index 00000000..573dedd7
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "astronomy"
+"description": "The following are multiple choice questions (with answers) about astronomy.\n\n"
+"include": "_default_template_yaml"
+"task": "mmlu_astronomy"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml
new file mode 100644
index 00000000..4b20b795
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "business_ethics"
+"description": "The following are multiple choice questions (with answers) about business ethics.\n\n"
+"include": "_default_template_yaml"
+"task": "mmlu_business_ethics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml
new file mode 100644
index 00000000..f758e66d
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "clinical_knowledge"
+"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n"
+"include": "_default_template_yaml"
+"task": "mmlu_clinical_knowledge"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml
new file mode 100644
index 00000000..f8069007
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "college_biology"
+"description": "The following are multiple choice questions (with answers) about college biology.\n\n"
+"include": "_default_template_yaml"
+"task": "mmlu_college_biology"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml
new file mode 100644
index 00000000..e03fbccd
--- /dev/null
+++
b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_college_chemistry" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml new file mode 100644 index 00000000..a9d4a6f2 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_college_computer_science" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml new file mode 100644 index 00000000..f6a86179 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_college_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml new file mode 100644 index 00000000..0ea75fb3 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_college_medicine" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml new file mode 100644 index 00000000..82f13e40 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college physics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_college_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml new file mode 100644 index 00000000..e9e06de2 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer security.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_computer_security" diff --git a/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml new file mode 100644 index 00000000..30ca6efe --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml new file mode 100644 index 00000000..680cc507 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about 
econometrics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_econometrics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml new file mode 100644 index 00000000..8dd63b33 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml @@ -0,0 +1,4 @@ +"dataset_name": "electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml new file mode 100644 index 00000000..4979ee30 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml new file mode 100644 index 00000000..9b73509b --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml @@ -0,0 +1,4 @@ +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_formal_logic" diff --git a/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml new file mode 100644 index 00000000..8c43a6c9 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml @@ -0,0 +1,4 @@ +"dataset_name": "global_facts" +"description": "The following are multiple choice questions (with answers) about global facts.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_global_facts" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml new file mode 100644 index 00000000..453d3033 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_biology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml new file mode 100644 index 00000000..714ee0e5 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml new file mode 100644 index 00000000..9326e259 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_computer_science" diff --git 
a/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml new file mode 100644 index 00000000..e212cd22 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml new file mode 100644 index 00000000..a7fffc25 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_geography" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 00000000..7255d60f --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 00000000..29d9ddd7 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml new file mode 100644 index 00000000..035e7a12 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml new file mode 100644 index 00000000..72b1c8cf --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml new file mode 100644 index 00000000..ef8f6ca5 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions 
(with answers) about high school physics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml new file mode 100644 index 00000000..5c4cce75 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml new file mode 100644 index 00000000..20ed42ec --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml new file mode 100644 index 00000000..18cd48da --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml new file mode 100644 index 00000000..b17daac6 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml new file mode 100644 index 00000000..080b2676 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml @@ -0,0 +1,4 @@ +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human aging.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_human_aging" diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml new file mode 100644 index 00000000..ca3389fe --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml @@ -0,0 +1,4 @@ +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_human_sexuality" diff --git a/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml new file mode 100644 index 00000000..a3d443e0 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international law.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_international_law" diff --git 
a/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml new file mode 100644 index 00000000..4ba00a2a --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml @@ -0,0 +1,4 @@ +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_jurisprudence" diff --git a/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml new file mode 100644 index 00000000..ea45a4f3 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml new file mode 100644 index 00000000..2ba6d162 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml @@ -0,0 +1,4 @@ +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_machine_learning" diff --git a/lm_eval/tasks/mmlu/default/mmlu_management.yaml b/lm_eval/tasks/mmlu/default/mmlu_management.yaml new file mode 100644 index 00000000..b4ea6da9 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_management.yaml @@ -0,0 +1,4 @@ +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_management" diff --git a/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml new file mode 100644 index 00000000..afa30a0c --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_marketing" diff --git a/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml new file mode 100644 index 00000000..92095635 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "medical_genetics" +"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_medical_genetics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml new file mode 100644 index 00000000..94ebd1b0 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml @@ -0,0 +1,4 @@ +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_miscellaneous" diff --git a/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml new file mode 100644 index 00000000..8bea0a1f --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml @@ -0,0 +1,4 @@ +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral 
disputes.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_moral_disputes" diff --git a/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml new file mode 100644 index 00000000..71dcc693 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml @@ -0,0 +1,4 @@ +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml new file mode 100644 index 00000000..e6b4cbcd --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_nutrition" diff --git a/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml new file mode 100644 index 00000000..b9a0b2c5 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_philosophy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml new file mode 100644 index 00000000..7f71bd54 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml @@ -0,0 +1,4 @@ +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_prehistory" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml new file mode 100644 index 00000000..94ca6e6e --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_professional_accounting" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml new file mode 100644 index 00000000..074c34e6 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_professional_law" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml new file mode 100644 index 00000000..2f99c316 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_professional_medicine" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml new file mode 100644 index 00000000..01565848 --- /dev/null +++ 
b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_professional_psychology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml new file mode 100644 index 00000000..0d46c66e --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml @@ -0,0 +1,4 @@ +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public relations.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_public_relations" diff --git a/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml new file mode 100644 index 00000000..f30dffde --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security studies.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_security_studies" diff --git a/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml new file mode 100644 index 00000000..c36bd403 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_sociology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml new file mode 100644 index 00000000..fe8c68d8 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_virology.yaml b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml new file mode 100644 index 00000000..4cbd0959 --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_virology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml new file mode 100644 index 00000000..375efbae --- /dev/null +++ b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml @@ -0,0 +1,4 @@ +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"include": "_default_template_yaml" +"task": "mmlu_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml new file mode 100644 index 00000000..49b9c425 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml @@ -0,0 +1,4 @@ +"dataset_name": "abstract_algebra" +"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": 
"mmlu_flan_n_shot_generative_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml new file mode 100644 index 00000000..0c8d7914 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "anatomy" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml new file mode 100644 index 00000000..c92a1027 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "astronomy" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml new file mode 100644 index 00000000..4b65902e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "business_ethics" +"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml new file mode 100644 index 00000000..295fb234 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml @@ -0,0 +1,4 @@ +"dataset_name": "clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml new file mode 100644 index 00000000..f945181b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_biology" +"description": "The following are multiple choice questions (with answers) about college biology.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml new file mode 100644 index 00000000..1fdab27d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml new file mode 100644 index 00000000..6b41a5bb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with 
answers) about college computer science.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml new file mode 100644 index 00000000..29e80a5e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml new file mode 100644 index 00000000..a5061541 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml new file mode 100644 index 00000000..ec3262ee --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college physics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml new file mode 100644 index 00000000..a9ade9c5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer security.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml new file mode 100644 index 00000000..5a903a65 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml new file mode 100644 index 00000000..847c8ce6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml new file mode 100644 index 00000000..038379e0 --- 
/dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml @@ -0,0 +1,4 @@ +"dataset_name": "electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml new file mode 100644 index 00000000..4fd779de --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml new file mode 100644 index 00000000..bb528831 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml @@ -0,0 +1,4 @@ +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml new file mode 100644 index 00000000..1145dcab --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml @@ -0,0 +1,4 @@ +"dataset_name": "global_facts" +"description": "The following are multiple choice questions (with answers) about global facts.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml new file mode 100644 index 00000000..574a0c58 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml new file mode 100644 index 00000000..ef79ed73 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml new file mode 100644 index 00000000..9d9200a6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" +"include": 
"_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml new file mode 100644 index 00000000..e4b52a9c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml new file mode 100644 index 00000000..8403d20e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml new file mode 100644 index 00000000..50ad3863 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml new file mode 100644 index 00000000..18bfb8b1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml new file mode 100644 index 00000000..1b04a06f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml new file mode 100644 index 00000000..9588af59 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" 
+"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml new file mode 100644 index 00000000..4aa033c8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml new file mode 100644 index 00000000..168c0c15 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml new file mode 100644 index 00000000..ba195da9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml new file mode 100644 index 00000000..0605fbc4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml new file mode 100644 index 00000000..aa54d758 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml new file mode 100644 index 00000000..d47b7fef --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml @@ -0,0 +1,4 @@ +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human aging.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml 
b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml new file mode 100644 index 00000000..9be15e54 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml @@ -0,0 +1,4 @@ +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml new file mode 100644 index 00000000..b80c9d58 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international law.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml new file mode 100644 index 00000000..5e7a5395 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml @@ -0,0 +1,4 @@ +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml new file mode 100644 index 00000000..fcb718a0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml new file mode 100644 index 00000000..d879b54c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml @@ -0,0 +1,4 @@ +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml new file mode 100644 index 00000000..887c71a3 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml @@ -0,0 +1,4 @@ +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml new file mode 100644 index 00000000..bad500ca --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_marketing" diff 
--git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml new file mode 100644 index 00000000..c4faff12 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "medical_genetics" +"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml new file mode 100644 index 00000000..e9aac340 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml @@ -0,0 +1,4 @@ +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml new file mode 100644 index 00000000..41af33e0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml @@ -0,0 +1,4 @@ +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml new file mode 100644 index 00000000..1689c3d9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml @@ -0,0 +1,4 @@ +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml new file mode 100644 index 00000000..24be1a63 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml new file mode 100644 index 00000000..01040729 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml new file mode 100644 index 00000000..fc5a6fbe --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml @@ -0,0 +1,4 @@ +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_prehistory" 
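A note on the pattern in all of these generated files: each per-subject YAML carries only its overrides ("dataset_name", "description", "task") plus an "include" key naming the shared "_mmlu_flan_generative_template_yaml", which holds everything common to the group. Below is a minimal sketch of how such an include could be resolved at load time, assuming the template sits in the same directory as the including file and that per-subject keys override template keys; the harness's actual config loader is not part of this patch, so load_task_config is illustrative rather than its real API.

import os

import yaml


def load_task_config(yaml_path):
    """Load a task YAML, recursively merging in any "include"d template."""
    with open(yaml_path) as f:
        config = yaml.safe_load(f)

    include = config.pop("include", None)
    if include is not None:
        # Assumption: includes are looked up relative to the including file.
        template_path = os.path.join(os.path.dirname(yaml_path), include)
        merged = load_task_config(template_path)
        # Keys from the per-subject file win over the template's keys.
        merged.update(config)
        config = merged

    return config

Under that reading, load_task_config("lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml") would return the template's shared prompt and metric settings with the anatomy-specific "task", "dataset_name", and "description" layered on top.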
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml new file mode 100644 index 00000000..cbdd2f0d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml new file mode 100644 index 00000000..42e46529 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml new file mode 100644 index 00000000..a64610e6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml new file mode 100644 index 00000000..b0c574fe --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml new file mode 100644 index 00000000..ff1030fc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml @@ -0,0 +1,4 @@ +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public relations.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml new file mode 100644 index 00000000..25555da4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security studies.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml new file mode 100644 index 00000000..f8ac254c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml @@ -0,0 +1,4 
@@ +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml new file mode 100644 index 00000000..af3917ac --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml new file mode 100644 index 00000000..b8df2d59 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml new file mode 100644 index 00000000..496f66c5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml @@ -0,0 +1,4 @@ +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml new file mode 100644 index 00000000..4ea918d6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml @@ -0,0 +1,4 @@ +"dataset_name": "abstract_algebra" +"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml new file mode 100644 index 00000000..9205bd31 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "anatomy" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml new file mode 100644 index 00000000..dcd41de7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "astronomy" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml new file mode 100644 index 00000000..2b57abf3 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml @@ -0,0 
+1,4 @@ +"dataset_name": "business_ethics" +"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml new file mode 100644 index 00000000..5b5da42e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml @@ -0,0 +1,4 @@ +"dataset_name": "clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml new file mode 100644 index 00000000..c8cc429d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_biology" +"description": "The following are multiple choice questions (with answers) about college biology.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml new file mode 100644 index 00000000..8be3a04d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml new file mode 100644 index 00000000..506ee760 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml new file mode 100644 index 00000000..a9fe1814 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml new file mode 100644 index 00000000..6f5d767a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_medicine" 
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml new file mode 100644 index 00000000..c6c22a40 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college physics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml new file mode 100644 index 00000000..96bccc15 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer security.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml new file mode 100644 index 00000000..2fc15ed0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml new file mode 100644 index 00000000..07dbf921 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml new file mode 100644 index 00000000..94492b11 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml @@ -0,0 +1,4 @@ +"dataset_name": "electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml new file mode 100644 index 00000000..2cc56ef8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml new file mode 100644 index 00000000..17e28205 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml @@ -0,0 +1,4 @@ 
+"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml new file mode 100644 index 00000000..2b3cb863 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml @@ -0,0 +1,4 @@ +"dataset_name": "global_facts" +"description": "The following are multiple choice questions (with answers) about global facts.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml new file mode 100644 index 00000000..ed3e70b2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml new file mode 100644 index 00000000..729d37fa --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml new file mode 100644 index 00000000..7003e94c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml new file mode 100644 index 00000000..0ad96085 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml new file mode 100644 index 00000000..f26e8bc6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" +"include": 
"_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml new file mode 100644 index 00000000..523e278d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml new file mode 100644 index 00000000..6b08a4fc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml new file mode 100644 index 00000000..83244ebb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml new file mode 100644 index 00000000..982f3f08 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml new file mode 100644 index 00000000..25c32369 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml new file mode 100644 index 00000000..a6e431db --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" +"include": 
"_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml new file mode 100644 index 00000000..fa9075f5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml new file mode 100644 index 00000000..094f95d0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml new file mode 100644 index 00000000..6ffd6d08 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml new file mode 100644 index 00000000..d70d5e85 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml @@ -0,0 +1,4 @@ +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human aging.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml new file mode 100644 index 00000000..39751188 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml @@ -0,0 +1,4 @@ +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml new file mode 100644 index 00000000..03fab6ef --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international law.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml new 
file mode 100644 index 00000000..bb6bfc6f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml @@ -0,0 +1,4 @@ +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml new file mode 100644 index 00000000..d57576cd --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml new file mode 100644 index 00000000..2c586922 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml @@ -0,0 +1,4 @@ +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml new file mode 100644 index 00000000..66b14f7f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml @@ -0,0 +1,4 @@ +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml new file mode 100644 index 00000000..aacee467 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_marketing" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml new file mode 100644 index 00000000..72d607fb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "medical_genetics" +"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml new file mode 100644 index 00000000..14db1ba8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml @@ -0,0 +1,4 @@ +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_miscellaneous" diff --git 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml new file mode 100644 index 00000000..0beccf44 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml @@ -0,0 +1,4 @@ +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml new file mode 100644 index 00000000..4d884b63 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml @@ -0,0 +1,4 @@ +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml new file mode 100644 index 00000000..ba1fdf61 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml new file mode 100644 index 00000000..21645e77 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml new file mode 100644 index 00000000..74d9f30c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml @@ -0,0 +1,4 @@ +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml new file mode 100644 index 00000000..9010995f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml new file mode 100644 index 00000000..15fdad65 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"include": 
"_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml new file mode 100644 index 00000000..1bcc6a9a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml new file mode 100644 index 00000000..9144805c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml new file mode 100644 index 00000000..0b4adc04 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml @@ -0,0 +1,4 @@ +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public relations.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml new file mode 100644 index 00000000..2f4178f0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security studies.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml new file mode 100644 index 00000000..c583cf24 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml new file mode 100644 index 00000000..f41d3c27 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml new file mode 100644 index 00000000..c2cafd9b --- /dev/null +++ 
b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml new file mode 100644 index 00000000..b1d1de0f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml @@ -0,0 +1,4 @@ +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_world_religions" -- GitLab From 4f5b72bc21249c267e19667e057b48b6f34f82bc Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 12:22:14 +0000 Subject: [PATCH 035/212] fixed template --- lm_eval/tasks/bbh/_generate_configs.py | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml | 2 +- .../bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml | 2 +- .../bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml | 2 +- .../bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml | 2 +- .../tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml | 2 +- .../bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml | 2 +- .../flan_cot_fewshot/salient_translation_error_detection.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml | 2 +- .../tracking_shuffled_objects_five_objects.yaml | 2 +- .../tracking_shuffled_objects_seven_objects.yaml | 2 +- .../tracking_shuffled_objects_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/boolean_expressions.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/causal_judgement.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/date_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/disambiguation_qa.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/dyck_languages.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/formal_fallacies.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml | 2 +- .../bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml | 2 +- .../flan_cot_zeroshot/logical_deduction_seven_objects.yaml | 2 +- .../flan_cot_zeroshot/logical_deduction_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml | 2 
+- .../tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml | 2 +- .../flan_cot_zeroshot/reasoning_about_colored_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml | 2 +- .../salient_translation_error_detection.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml | 2 +- .../tracking_shuffled_objects_five_objects.yaml | 2 +- .../tracking_shuffled_objects_seven_objects.yaml | 2 +- .../tracking_shuffled_objects_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml | 2 +- lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml | 2 +- .../bbh/flan_fewshot/logical_deduction_five_objects.yaml | 2 +- .../bbh/flan_fewshot/logical_deduction_seven_objects.yaml | 2 +- .../bbh/flan_fewshot/logical_deduction_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/navigate.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml | 2 +- .../bbh/flan_fewshot/reasoning_about_colored_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml | 2 +- .../bbh/flan_fewshot/salient_translation_error_detection.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/snarks.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml | 2 +- .../flan_fewshot/tracking_shuffled_objects_five_objects.yaml | 2 +- .../flan_fewshot/tracking_shuffled_objects_seven_objects.yaml | 2 +- .../flan_fewshot/tracking_shuffled_objects_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml | 2 +- lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml | 2 +- .../bbh/flan_zeroshot/logical_deduction_five_objects.yaml | 2 +- .../bbh/flan_zeroshot/logical_deduction_seven_objects.yaml | 2 +- .../bbh/flan_zeroshot/logical_deduction_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml | 2 +- 
lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml | 2 +- .../bbh/flan_zeroshot/reasoning_about_colored_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml | 2 +- .../flan_zeroshot/salient_translation_error_detection.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml | 4 ++-- lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml | 2 +- .../flan_zeroshot/tracking_shuffled_objects_five_objects.yaml | 2 +- .../tracking_shuffled_objects_seven_objects.yaml | 2 +- .../tracking_shuffled_objects_three_objects.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml | 2 +- lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml | 2 +- 109 files changed, 110 insertions(+), 110 deletions(-) diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index 8d805a64..ae2fb38f 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -68,7 +68,7 @@ if __name__ == "__main__": doc_to_text = doc_to_text + " Let's think step by step.\n" yaml_dict = { - "include": "_template_yaml", + "include": base_yaml_name, "task": f"bbh_{args.task_prefix}_{task}", "dataset_name": task, "description": description+"\n\n", diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml index 849e0435..87f6e5af 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/boolean_expressions.yaml @@ -1,5 +1,5 @@ "dataset_name": "boolean_expressions" "description": "Evaluate the result of a random Boolean expression.\n\n" "doc_to_text": "Q: not ( ( not not True ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not ( ( not not True ) ) = not ( ( A ) )\" where \"A = not not True\".\nLet's evaluate A: A = not not True = not (not True) = not False = True.\nPlugging in A, we get: Z = not ( ( A ) ) = not ( ( True ) ) = not True = False. So the answer is False.\n\nQ: True and False and not True and True is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = True and False and not True and True = A and B\" where \"A = True and False\" and \"B = not True and True\".\nLet's evaluate A: A = True and False = False.\nLet's evaluate B: B = not True and True = not (True and True) = not (True) = False.\nPlugging in A and B, we get: Z = A and B = False and False = False. So the answer is False.\n\nQ: not not ( not ( False ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not not ( not ( False ) ) = not not ( A )\" where \"A = not ( False )\".\nLet's evaluate A: A = not ( False ) = not False = True.\nPlugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True.
So the answer is True.\n\nQ: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_boolean_expressions" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml index e7e80969..420320bc 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/causal_judgement.yaml @@ -1,5 +1,5 @@ "dataset_name": "causal_judgement" "description": "Answer questions about causal attribution.\n\n" "doc_to_text": "Q: How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild.\" A typical person would assume that this passage suggests that Frank T. had no intention of shooting and injuring someone and that the bullet accidentally hit the neighbor's body; therefore, we conclude that Frank T. did not intentionally hit his neighbor. So the answer is No.\n\nQ: How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that the boss ordered them both to arrive at the meeting room at the same time and that the motion detector was set up to be triggered if at least one person appeared in the room at the same time.\" A typical person would assume that the person probably meant to say the detector was set up to be triggered if \"both persons\" appeared in the room at the same time, not at least one person, since otherwise the phrase \"at the same time\" would not make much sense in that sentence. Because the motion detector went off, a typical person would therefore come to the conclusion that both Suzy and Billy triggered the motion detector to go off; hence, Billy did indeed cause the motion detector to go off. So the answer is Yes.\n\nQ: How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot.
If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"He aims the dart at the low point region.\" A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_causal_judgement" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml index d5dc2117..c88d8fe0 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/date_understanding.yaml @@ -1,5 +1,5 @@ "dataset_name": "date_understanding" "description": "Infer the date from context.\n\n" "doc_to_text": " Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\nOptions:\n(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\nA: Let's think step by step.\nIf today is Christmas Eve of 1937, then today's date is December 24, 1937. 10 days before today is December 14, 1937, that is 12/14/1937. So the answer is (D). Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\nOptions:\n(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n(D) 11/02/2018\n(E) 11/04/2018\nA: Let's think step by step.\nIf tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from today is 11/11/2018. So the answer is (B). Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\nA: Let's think step by step.\nIf Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_date_understanding" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml index 2d08474c..8dc5db9e 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/disambiguation_qa.yaml @@ -1,5 +1,5 @@ "dataset_name": "disambiguation_qa" "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" "doc_to_text": " In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The chief told the counselor that they took the day off.\nOptions:\n(A) The chief took the day off\n(B) The counselor took the day off\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"they\" might be referring to. There are two possible referents for \"they\", namely the chief and the counselor. 
The verb \"told\" might be able to help us determine which one is more likely (if either). Let X be the chief and Y the counselor. The sentence is then of the form \"X told Y that (X or Y) did something.\"\nLet's consider Y first: \"X told Y that Y did something.\" This case does not make much sense, as Y would already have the information that Y did something, because it is information about themself.\nNow, consider X: \"X told Y that X did something.\" This makes sense, because X would be sharing some information about themself that Y might not have known before.\nBecause in this context, X is the chief and Y is the counselor, the answer should be the chief. So the answer is (A). In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The manager sent a message to the secretary, but he didn't reply yet.\nOptions:\n(A) The secretary didn't reply yet\n(B) The manager didn't reply yet\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"he\" might be referring to. There are two possible referents for \"he\", namely the manager and the secretary. The verbs \"sent\" and \"reply\" might be able to help us determine which one is more likely (if either). Let X be the manager and Y the secretary. The sentence is then of the form \"X sent a message to Y, but (X or Y) didn't reply yet.\"\nLet's consider Y first: \"X sent a message to Y, but Y didn't reply yet.\" This case makes sense, because of the implicit causality of the sentence. Y was the receiver of the message, but Y didn't get back to X yet.\nNow, consider X: \"X sent a message to Y, but X didn't reply yet.\" This case doesn't make sense, because X was the initial sender of the message, so it is now Y's turn to write back to X.\nBecause in this context, X is the manager and Y is the secretary, the answer should be the secretary. So the answer is (A). In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: Bailey will plan to meet the director at his office\nOptions:\n(A) It will be Bailey's office\n(B) It will be the director's office\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"his\" might be referring to. There are two possible referents for \"his\", namely Bailey's and the director's. The verb phrase \"plan to meet\" might be able to help us determine which one is more likely (if either). Let X be Bailey and Y the director. The sentence is then of the form \"X will plan to meet Y at (X or Y)'s office.\"\nLet's consider Y first: \"X will plan to meet Y at Y's office.\" This case makes sense, because X might want to meet up with Y at Y's office.\nNow, consider X: \"X will plan to meet Y at X's office.\" This case also makes sense, because X might want to meet up with Y at X's own office.\nBecause both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. 
So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_disambiguation_qa" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml index f15bda16..f82f70db 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/dyck_languages.yaml @@ -1,5 +1,5 @@ "dataset_name": "dyck_languages" "description": "Correctly close a Dyck-n word.\n\n" "doc_to_text": " Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: [ ; stack: [\n2: { ; stack: [ {\n3: [ ; stack: [ { [\nNow, we have reached the end. The final stack is \"[ { [\".\nWe will need to pop out \"[\", \"{\", \"[\" one by one in that order.\nSo, we need \"]\", \"}\", \"]\". So the answer is ] } ]. Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: < ; stack: <\n2: > ; stack: empty\n3: ( ; stack: (\n4: ( ; stack: ( (\n5: [ ; stack: ( ( [\n6: [ ; stack: ( ( [ [\n7: ( ; stack: ( ( [ [ (\n8: { ; stack: ( ( [ [ ( {\n9: } ; stack: ( ( [ [ (\n10: ) ; stack: ( ( [ [\n11: [ ; stack: ( ( [ [ [\n12: < ; stack: ( ( [ [ [ <\n13: > ; stack: ( ( [ [ [\n14: ] ; stack: ( ( [ [\n15: ] ; stack: ( ( [\nNow, we have reached the end. The final stack is \"( ( [\".\nWe will need to pop out \"[\", \"(\", \"(\" one by one in that order.\nSo, we need \"]\", \")\", \")\". So the answer is ] ) ). Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: < ; stack: <\n2: [ ; stack: < [\n3: < ; stack: < [ <\n4: [ ; stack: < [ < [\n5: { ; stack: < [ < [ {\n6: < ; stack: < [ < [ { <\n7: [ ; stack: < [ < [ { < [\n8: ] ; stack: < [ < [ { <\n9: < ; stack: < [ < [ { < <\n10: { ; stack: < [ < [ { < < {\n11: } ; stack: < [ < [ { < <\n12: > ; stack: < [ < [ { <\n13: > ; stack: < [ < [ {\n14: } ; stack: < [ < [\n15: ] ; stack: < [ <\n16: > ; stack: < [\n17: { ; stack: < [ {\n18: { ; stack: < [ { {\n19: ( ; stack: < [ { { (\n20: ) ; stack: < [ { {\n21: } ; stack: < [ {\n22: { ; stack: < [ { {\n23: < ; stack: < [ { { <\n24: [ ; stack: < [ { { < [\n25: < ; stack: < [ { { < [ <\n26: > ; stack: < [ { { < [\n27: ] ; stack: < [ { { <\n28: > ; stack: < [ { {\n29: } ; stack: < [ {\nNow, we have reached the end. The final stack is \"< [ {\".\nWe will need to pop out \"{\", \"[\", \"<\" one by one in that order.\nSo, we need \"}\", \"]\", \">\". 
So the answer is } ] >.Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_dyck_languages" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml index 7e28e84e..bbace29e 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/formal_fallacies.yaml @@ -1,5 +1,5 @@ "dataset_name": "formal_fallacies" "description": "Distinguish deductively valid arguments from formal fallacies.\n\n" "doc_to_text": " \"It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: Let's think step by step.\n(1) Lesley is a close friend of Fernando: Lesley = friend(Fernando).\n(2) Being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy: If X = friend(Fernando) OR SCHOOLMATE(Lowell), then X = great-grandfather(Leroy).\nHypothesis: Does it follow that Lesley is a great-grandfather of Leroy: Lesley = great-grandfather(Leroy)?\nLet’s see whether the Hypothesis can be deduced from the arguments (1) and (2) by logical reasoning?\nBy (1), we have Lesley = friend(Fernando). By (2), we have if Lesley = friend(Fernando), then Lesley = great-grandfather(Leroy).\nSo, it is true that Lesley is a great-grandfather of Leroy. So the answer is valid. \"It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: Whoever is not a great-grandfather of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient for not being a great-grandfather of Clyde. We may conclude: Everyone who is an ancestor of Dana is a stepbrother of Brian, too.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: Let's think step by step.\n(1) Whoever is not a great-grandfather of Clyde is a stepbrother of Brian: If X = NOT (great-grandfather(Clyde)), then X = stepbrother(Brian).\n(2): Being an ancestor of Dana is sufficient for not being a great-grandfather of Clyde: If X = ancestor(Dana), X = NOT (great-grandfather(Clyde)).\nHypothesis: Does it follow that everyone who is an ancestor of Dana is a stepbrother of Brian, too: If X = ancestor(Dana), then X = stepbrother(Brian)?\nLet’s see whether the Hypothesis can be deduced from the arguments (1) and (2) by logical reasoning?\nBy (2), we have if X = ancestor(Dana), X = NOT (great-grandfather(Clyde)).\nFurthermore, by (1), we have if X = NOT (great-grandfather(Clyde)), then X = stepbrother(Brian).\nBy the transitive relation rule in first-order logic, we then have: if X = ancestor(Dana), then X = stepbrother(Brian).\nSo, it is true that everyone who is an ancestor of Dana is a stepbrother of Brian. So the answer is valid. \"It is not always easy to grasp who is consuming which products. The following argument pertains to this question: Every infrequent user of Paul Mitchell shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, or both. 
No regular consumer of Lush soap is a rare consumer of Nioxin shampoo and, in the same time, a loyal buyer of Caress soap. It follows that whoever is an infrequent user of Paul Mitchell shampoo is not a regular consumer of Lush soap.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: Let's think step by step.\n(1) Every infrequent user of Paul Mitchell shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, or both: If X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin) OR X = loyal-buyer(Caress).\n(2): No regular consumer of Lush soap is a rare consumer of Nioxin shampoo and a loyal buyer of Caress soap at the same time. If X = regular-consumer(Lush), then X = NOT (rare-consumer(Nioxin) AND loyal-buyer(Caress)).\nHypothesis: Does it follow that whoever is an infrequent user of Paul Mitchell shampoo is not a regular consumer of Lush soap: If X = infrequent-user(Paul Mitchell), then X = NOT (regular-consumer(Lush))?\nLet’s see whether the Hypothesis can be deduced from the arguments (1) and (2) by logical reasoning?\nBy (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin) OR X = loyal-buyer(Caress). We need to consider both cases separately:\nThe case X = rare-consumer(Nioxin) does not appear in (2).\nThe case X = loyal-buyer(Caress) does not appear in (2), either.\nSo, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_formal_fallacies" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml index 07844a53..6c3141aa 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/geometric_shapes.yaml @@ -1,5 +1,5 @@ "dataset_name": "geometric_shapes" "description": "Name geometric shapes from their SVG paths.\n\n" "doc_to_text": " This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: Let's think step by step.\nThis SVG path element contains \"M\" and \"L\" commands. M takes two parameters (x,y) and moves the current point to the coordinates (x,y). L takes two parameters (x,y) and draws a line from the previous coordinate to the new coordinate (x,y).\nThis path can be decomposed into 9 separate commands.\n(1) M 31.00,73.00: Move the current point to 31.00,73.00.\n(2) L 32.00,59.00: Create a line from 31.00,73.00 to 32.00,59.00.\n(3) L 44.00,50.00: Create a line from 32.00,59.00 to 44.00,50.00.\n(4) L 49.00,41.00: Create a line from 44.00,50.00 to 49.00,41.00.\n(5) L 64.00,37.00: Create a line from 49.00,41.00 to 64.00,37.00.\n(6) L 71.00,55.00: Create a line from 64.00,37.00 to 71.00,55.00.\n(7) L 64.00,76.00: Create a line from 71.00,55.00 to 64.00,76.00.\n(8) L 52.00,61.00: Create a line from 64.00,76.00 to 52.00,61.00.\n(9) L 31.00,73.00: Create a line from 52.00,61.00 to 31.00,73.00.\nThis SVG path starts at point 31.00,73.00, creates eight consecutive and touching lines, and then returns back its starting point, thereby creating an eight-sided shape. It does not have any curves or arches. \"octagon\" is the only eight-sided object on the list. So the answer is (F). 
This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: Let's think step by step.\nThis SVG path element contains \"M\" and \"L\" commands. M takes two parameters (x,y) and moves the current point to the coordinates (x,y). L takes two parameters (x,y) and draws a line from the previous coordinate to the new coordinate (x,y).\nThis path can be decomposed into 6 separate commands.\n(1) M 14.19,26.04: Move the current point to 14.19,26.04.\n(2) L 51.43,39.21: Create a line from 14.19,26.04 to 51.43,39.21.\n(3) L 58.44,36.69: Create a line from 51.43,39.21 to 58.44,36.69.\n(4) L 56.63,30.17: Create a line from 58.44,36.69 to 56.63,30.17.\n(5) L 48.53,26.66: Create a line from 56.63,30.17 to 48.53,26.66.\n(6) L 14.19,26.04: Create a line from 48.53,26.66 to 14.19,26.04.\nThis SVG path starts at point 14.19,26.04, creates five consecutive and touching lines, and then returns back its starting point, thereby creating a five-sided shape. It does not have any curves or arches. \"pentagon\" is the only five-sided polygon on the list. So the answer is (G). This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: Let's think step by step.\nThis SVG path element contains \"M\" and \"L\" commands. M takes two parameters (x,y) and moves the current point to the coordinates (x,y). L takes two parameters (x,y) and draws a line from the previous coordinate to the new coordinate (x,y).\nThis path can be decomposed into 5 separate commands.\n(1) M 41.00,43.00: Move the current point to 41.00,43.00.\n(2) L 37.00,34.00: Create a line from 41.00,43.00 to 37.00,34.00.\n(3) L 41.00,33.00: Create a line from 37.00,34.00 to 41.00,33.00.\n(4) L 45.00,34.00: Create a line from 41.00,33.00 to 45.00,34.00.\n(5) L 41.00,43.00: Create a line from 45.00,34.00 to 41.00,43.00.\nThis SVG path starts at point 41.00,43.00, creates four consecutive and touching lines, and then returns back its starting point, thereby creating a four-sided shape. \"kite\" and \"rectangle\" are the only two four-sided polygons on the list. So, we need to determine which one is the correct answer.\nA kite has two pairs of equal-length adjacent sides, whereas a rectangle has two pairs of equal-length alternate (opposite) sides. Now, let's check whether the two adjacent sides of this shape are equal.\nLength of side A: |A| = sqrt((41.00-37.00)^2 + (43.00-34.00)^2) = sqrt((4)^2 + (9)^2) = sqrt(16 + 81) = sqrt(97).\nLength of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 + (1)^2) = sqrt(16 + 1) = sqrt(17).\nLength of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).\nLength of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).\nNote that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is \"kite\". 
So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_geometric_shapes" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml index 2dc997cc..f371f45c 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/hyperbaton.yaml @@ -1,5 +1,5 @@ "dataset_name": "hyperbaton" "description": "Order adjectives correctly in English sentences.\n\n" "doc_to_text": " Which sentence has the correct adjective order:\nOptions:\n(A) rubber terrible ship\n(B) terrible rubber ship\nA: Let's think step by step.\nWhen there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: \"[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun\".\nOption (A): \"rubber terrible ship\". (1) rubber\" falls into the material category. (2) \"terrible\" falls into the opinion category. Option (A) has the following adjective order: [7. material] [1. opinion] (or, in numeric terms, 7 1). Because 7 < 1 is not correct, (A) does not have the correct ordering.\nOption (B): \"terrible rubber ship\". Option (B) has the following adjective order: [1. opinion] [7. material] (or, in numeric terms, 1 7). Because 1 < 7 is correct, (B) has the correct ordering. So the answer is (B). Which sentence has the correct adjective order:\nOptions:\n(A) repulsive small Brazilian exercise ship\n(B) Brazilian repulsive exercise small ship\nA: Let's think step by step.\nWhen there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: \"[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun\".\nOption (A): \"repulsive small Brazilian exercise ship\". (1) \"repulsive\" falls into the opinion category. (2) \"small\" falls into the size category. (3) \"Brazilian\" falls into the origin category. (4) \"exercise\" falls into the purpose category. Option (A) has the following adjective order: [1. opinion] [2. size] [6. origin] [8. purpose] (or, in numeric terms, 1 2 6 8). Because 1 < 2 < 6 < 8 is correct, (A) has the correct ordering.\nOption (B): \"Brazilian repulsive exercise small ship\". Option (B) has the following adjective order: [6. origin] [1. opinion] [8. purpose] [2. size] (or, in numeric terms, 6 1 8 2). Because 6 < 1 < 8 < 2 is not correct, (B) does not have the correct ordering. So the answer is (A). Which sentence has the correct adjective order:\nOptions:\n(A) blue gold wonderful square shoe\n(B) wonderful square blue gold shoe\nA: Let's think step by step.\nWhen there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: \"[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun\".\nOption (A): \"blue gold wonderful square shoe\". (1) \"blue\" falls into the color category. (2) \"gold\" falls into the material category. (3) \"wonderful\" falls into the opinion category. (4) \"square\" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.\nOption (B): \"wonderful square blue gold shoe\". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. 
color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_hyperbaton" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml index b54bac3b..7e6fe33c 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_five_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_five_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": " The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: Let's think step by step.\n(1) Eve finished above Amy: \"(above) ? Eve ? Amy ? (below)\".\n(2) Eli finished below Amy: \"(above) ? Amy ? Eli ? (below)\".\n(3) Combining (1) and (2) we get the following ordering: \"(above) Eve Amy Eli (below)\".\nAccording to this ordering, the person who finished last (the one at the bottom of this list) is Eli.\nEli finished last. So the answer is (B). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: Let's think step by step.\n(1) The green book is to the right of the white book: \"(left) ? white ? green ? (right)\".\n(2) The orange book is the rightmost: \"(left) ? white ? green orange (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white green orange (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (A). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: Let's think step by step.\n(1) The white book is to the left of the gray book: \"(left) ? white ? gray ? (right)\".\n(2) The red book is the second from the left: \"(left) ? white red gray ? (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white red gray (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. 
So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_logical_deduction_five_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml index f4799ae6..9f04d580 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_seven_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_seven_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": " The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: Let's think step by step.\n(1) Eve finished above Amy: \"(above) ? Eve ? Amy ? (below)\".\n(2) Eli finished below Amy: \"(above) ? Amy ? Eli ? (below)\".\n(3) Combining (1) and (2) we get the following ordering: \"(above) Eve Amy Eli (below)\".\nAccording to this ordering, the person who finished last (the one at the bottom of this list) is Eli.\nEli finished last. So the answer is (B). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: Let's think step by step.\n(1) The green book is to the right of the white book: \"(left) ? white ? green ? (right)\".\n(2) The orange book is the rightmost: \"(left) ? white ? green orange (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white green orange (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (A). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: Let's think step by step.\n(1) The white book is to the left of the gray book: \"(left) ? white ? gray ? (right)\".\n(2) The red book is the second from the left: \"(left) ? white red gray ? (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white red gray (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. 
So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_logical_deduction_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml index cbc0dbd1..4affd9b7 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/logical_deduction_three_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_three_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": " The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: Let's think step by step.\n(1) Eve finished above Amy: \"(above) ? Eve ? Amy ? (below)\".\n(2) Eli finished below Amy: \"(above) ? Amy ? Eli ? (below)\".\n(3) Combining (1) and (2) we get the following ordering: \"(above) Eve Amy Eli (below)\".\nAccording to this ordering, the person who finished last (the one at the bottom of this list) is Eli.\nEli finished last. So the answer is (B). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: Let's think step by step.\n(1) The green book is to the right of the white book: \"(left) ? white ? green ? (right)\".\n(2) The orange book is the rightmost: \"(left) ? white ? green orange (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white green orange (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. So the answer is (A). The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: Let's think step by step.\n(1) The white book is to the left of the gray book: \"(left) ? white ? gray ? (right)\".\n(2) The red book is the second from the left: \"(left) ? white red gray ? (right)\".\n(3) Combining (1) and (2) we get the following ordering: \"(left) white red gray (right)\".\nAccording to this ordering, the leftmost book is the white book.\nThe white book is the leftmost. 
So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_logical_deduction_three_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml index 573b3222..4ba1454b 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/movie_recommendation.yaml @@ -1,5 +1,5 @@ "dataset_name": "movie_recommendation" "description": "Recommend movies similar to the given list of movies.\n\n" "doc_to_text": " Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:\nOptions:\n(A) Tetsuo\n(B) the Ironman\n(C) The Princess Bride\n(D) The Barkley Marathons The Race That Eats Its Young\n(E) Bug\nA: Let's think step by step.\n- Star Wars Episode IV - A New Hope (action, adventure, fantasy; 1977)\n- Indiana Jones and the Last Crusade (action, adventure; 1989)\n- Star Wars Episode V - The Empire Strikes Back (action, adventure, fantasy; 1980)\n- The Big Lebowski (action, drama, comedy; 1998)\nThese are all famous classic American movies produced before 2000. Amongst all the options, the only movie similar to these ones seems to be The Princess Bride (1987). So the answer is (C). Find a movie similar to Twister, The Silence of the Lambs, Independence Day, Braveheart:\nOptions:\n(A) They Shoot Horses\n(B) Don't They\n(C) Forrest Gump\n(D) The Salton Sea\n(E) Extreme Days\nA: Let's think step by step.\n- Twister (action, adventure, thriller; 1996)\n- The Silence of the Lambs (crime, drama, thriller; 1991)\n- Independence Day (action, science-fiction, drama; 1996)\n- Braveheart (biography, drama, epic; 1995)\nThese are all famous Hollywood movies produced around the 1990s. Amongst all the options, the only movie similar to these ones seems to be Forrest Gump (comedy, drama, romance; 1994). So the answer is (C). Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest Gump:\nOptions:\n(A) Phenomena\n(B) Lilting\n(C) Catwoman\n(D) Edge of Tomorrow\nA: Let's think step by step.\n- Minority Report (action, crime, mystery; 2002)\n- Total Recall (action, adventure, science-fiction; 2012)\n- Inside Out (animation, family, comedy; 2015)\n- Forrest Gump (comedy, drama, romance; 1994)\nThese are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. 
So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_movie_recommendation" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml index 21f34e7a..f92d96e7 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/multistep_arithmetic_two.yaml @@ -1,5 +1,5 @@ "dataset_name": "multistep_arithmetic_two" "description": "Solve multi-step arithmetic problems.\n\n" "doc_to_text": " ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =\nA: Let's think step by step.\nLet’s recall that the order of operations in mathematics is as follows: (1) Parentheses, (2) exponents, (3) multiplication and division (from left to right), (4) addition and subtraction (from left to right). So, remember to always compute the expressions inside parentheses or brackets first.\nThis equation can be written as \"A * B\", where A = (-5 + 9 * -4 - 0) and B = (4 + -7 + 0 * -5).\nLet's calculate A = (-5 + 9 * -4 - 0) = (-5 + (9 * -4) - 0) = (-5 + (-36) - 0) = (-5 + -36 - 0) = -5 - 36 = -41.\nLet's calculate B = (4 + -7 + 0 * -5) = (4 + -7 + (0 * -5)) = (4 + -7 + 0) = (4 + -7) = (4 - 7) = -3.\nThen, the final equation is A * B = -41 * -3 = (-41) * (-3) = 123. So the answer is 123. ((-9 * 7 * 7 * -9) + (4 * -9 - 8 - -4)) =\nA: Let's think step by step.\nLet’s recall that the order of operations in mathematics is as follows: (1) Parentheses, (2) exponents, (3) multiplication and division (from left to right), (4) addition and subtraction (from left to right). So, remember to always compute the expressions inside parentheses or brackets first.\nThis equation can be written as \"A + B\", where A = (-9 * 7 * 7 * -9) and B = (4 * -9 - 8 - -4).\nLet's calculate A = (-9 * 7 * 7 * -9) = ((-9 * 7) * (7 * -9)) = ((-63) * (-63)) = 3969.\nLet's calculate B = (4 * -9 - 8 - (-4)) = ((4 * -9) - 8 - (-4)) = ((-36) - 8 - (-4)) = ((-36 - 8) - (-4)) = (-44 - (-4)) = -40.\nThen, the final equation is A + B = 3969 + -40 = 3969 - 40 = 3929. So the answer is 3929. ((-3 + 5 * 8 * -4) - (9 - 8 * -7 + -9)) =\nA: Let's think step by step.\nLet’s recall that the order of operations in mathematics is as follows: (1) Parentheses, (2) exponents, (3) multiplication and division (from left to right), (4) addition and subtraction (from left to right). So, remember to always compute the expressions inside parentheses or brackets first.\nThis equation can be written as \"A - B\", where A = (-3 + 5 * 8 * -4) and B = (9 - 8 * -7 + -9).\nLet's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.\nLet's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9) = (65 - 9) = 56.\nThen, the final equation is A - B = -163 - 56 = -219. 
So the answer is -219.Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_multistep_arithmetic_two" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml index be3cd234..37923249 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/navigate.yaml @@ -1,5 +1,5 @@ "dataset_name": "navigate" "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" "doc_to_text": " If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nWe start at the origin (0, 0), facing the positive y-axis.\n(1) Turn left: (0, 0), facing the negative x-axis.\n(2) Turn around: (0, 0), facing the positive x-axis.\n(3) Turn left: (0, 0), facing the positive y-axis.\n(4) Take 7 steps: (0, 7), facing the positive y-axis.\n(5) Take 2 steps: (0, 9), facing the positive y-axis.\n(6) Take 4 steps: (0, 13), facing the positive y-axis.\n(7) Take 8 steps: (0, 21), facing the positive y-axis.\nSince (0, 21) is not (0, 0), we are not where we started. So the answer is No. If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nWe start at the origin (0, 0), facing the positive y-axis.\n(1) Turn around: (0, 0), facing the negative y-axis.\n(2) Take 1 step: (0, -1), facing the negative y-axis.\n(3) Take 6 steps: (0, -7), facing the negative y-axis.\n(4) Turn around: (0, -7), facing the positive y-axis.\n(5) Take 6 steps: (0, -1), facing the positive y-axis.\n(6) Take 9 steps: (0, 8), facing the positive y-axis.\n(7) Take 1 step: (0, 9), facing the positive y-axis.\nSince (0, 9) is not (0, 0), we are not where we started. So the answer is No. If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nWe start at the origin (0, 0), facing the positive y-axis.\n(1) Always face forward: (0, 0), facing the positive y-axis.\n(2) Take 2 steps right: (0, 2), facing the positive y-axis.\n(3) Take 9 steps left: (0, -7), facing the positive y-axis.\n(4) Take 7 steps right: (0, 0), facing the positive y-axis.\nSince (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml index 767d414f..e6f2635b 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/object_counting.yaml @@ -1,5 +1,5 @@ "dataset_name": "object_counting" "description": "Questions that involve enumerating objects and asking the model to count them.\n\n" "doc_to_text": " I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. 
How many fruits do I have?\nA: Let's think step by step.\nWe first identify the fruits on the list and include their quantity in parentheses:\n- blackberry (1)\n- nectarine (1)\n- plum (1)\n- strawberry (1)\n- banana (1)\n- orange (1)\nNow, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 1 + 1 = 6. So the answer is 6. I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape, a nectarine, and three plums. How many fruits do I have?\nA: Let's think step by step.\nWe first identify the fruits on the list and include their quantity in parentheses:\n- orange (1)\n- raspberry (1)\n- peaches (2)\n- blackberry (1)\n- apple (1)\n- grape (1)\n- nectarine (1)\n- plums (3)\nNow, let's add the numbers in parentheses: 1 + 1 + 2 + 1 + 1 + 1 + 1 + 3 = 11. So the answer is 11. I have a lettuce head, a head of broccoli, an onion, a stalk of celery, two carrots, a garlic, and a yam. How many vegetables do I have?\nA: Let's think step by step.\nWe first identify the vegetables on the list and include their quantity in parentheses:\n- lettuce (1)\n- broccoli (1)\n- onion (1)\n- celery (1)\n- carrots (2)\n- garlic (1)\n- yam (1)\nNow, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml index 439c5f45..5b7228f4 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/penguins_in_a_table.yaml @@ -1,5 +1,5 @@ "dataset_name": "penguins_in_a_table" "description": "Answer questions about a table of penguins and their attributes.\n\n" "doc_to_text": " Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:\nJames, 12, 90, 12\nHow many penguins are less than 8 years old?\nOptions:\n(A) 1\n(B) 2\n(C) 3\n(D) 4\n(E) 5\nA: Let's think step by step.\nThis question focuses on age. We know the following: Louis is 7 years old, Bernard is 5 years old, Vincent is 9 years old, and Gwen is 8 years old.\nNow, we add James to this table: James is 12 years old.\nThe penguins that are less than 8 years old are Louis and Bernard.\nThere are 2 penguins less than 8 years old. So the answer is (B). Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. Which is the youngest penguin?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: Let's think step by step.\nThis question focuses on age. We know the following: Louis is 7 years old, Bernard is 5 years old, Vincent is 9 years old, and Gwen is 8 years old.\nAccording to the table, Bernard (5) is the youngest amongst them.\nThe youngest penguin is Bernard. So the answer is (B). 
Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. What is the name of the second penguin sorted by alphabetic order?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: Let's think step by step.\nThis question focuses on the name. We know the following: The names of the penguin in the table are Louis, Bernard, Vincent, and Gwen.\nWhen we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.\nThe name of the second penguin sorted by alphabetical order is Gwen.\nThe name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml index a93951da..f4ee36e6 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/reasoning_about_colored_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "reasoning_about_colored_objects" "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" "doc_to_text": " On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: Let's think step by step.\nAccording to this question, the color of the stress ball is blue. So the answer is (E). On the table, you see a bunch of objects arranged in a row: a purple paperclip, a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve fidget spinner, and a burgundy pen. What is the color of the object directly to the right of the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: Let's think step by step.\nAccording to this question, the objects are arranged in a row, from left to right, as follows: (1) a purple paperclip, (2) a pink stress ball, (3) a brown keychain, (4) a green scrunchiephone charger, (5) a mauve fidget spinner, (6) a burgundy pen.\nThe stress ball is the second object on the list, namely (2). The object that is to the right of the stress ball corresponds to (3), which is a brown keychain.\nThe color of the keychain is brown. So the answer is (F). On the nightstand, you see the following items arranged in a row: a teal plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, a pink notebook, and a grey cup. 
How many non-orange items do you see to the left of the teal item?\nOptions:\n(A) zero\n(B) one\n(C) two\n(D) three\n(E) four\n(F) five\n(G) six\nA: Let's think step by step.\nAccording to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.\nThe teal plate is the first item, namely (1). There is no item to the left of the teal item.\nThe number of non-orange items to the left of the teal item is zero. So the answer is (A).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml index d6a3485b..3b3f7799 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/ruin_names.yaml @@ -1,5 +1,5 @@ "dataset_name": "ruin_names" "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" "doc_to_text": " Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?\nOptions:\n(A) whitesnape\n(B) whitesnapke\n(C) whitesnuake\n(D) mwhitesnake\nA: Let's think step by step.\nThe original name is \"whitesnake\". This is the name of an old English hard rock band. It is a compound word, formed by the words \"white\" and \"snake\".\n(A) \"whitesnape\": It is formed by the combination of \"white\" and \"snake\"; therefore, \"snake\" has been changed to \"snape\". Snape makes a reference to the fictional character Severus Snape in the Harry Potter series, so (A) is indeed a meaningful and funny edit.\n(B) \"whitesnapke\": It is formed by the combination of \"white\" and \"snapke\", but \"snapke\" is not an actual word; therefore, \"whitesnapke\" is not humorous.\n(C) \"whitesnuake\": It is formed by the combination of \"white\" and \"snuake\", but \"snuake\" is not an actual word; therefore, \"whitesnuake\" is not humorous.\n(D) \"mwhitesnake\": It is formed by the combination of \"m\", \"white\", and \"snake\", but the prefix \"-m \"seems arbitrary; therefore, \"mwhitesnake\" is not meaningful or humorous.\nAbove the above, the only humorous edit is (A). So the answer is (A). Which of the following is a humorous edit of this artist or movie name: 'one of our dinosaurs is missing'?\nOptions:\n(A) ofne of our dinosaurs is missing\n(B) one af our dinosaurs is missing\n(C) one of our dinosaurs is pissing\n(D) one of our dinosaur is missing\nA: Let's think step by step.\nThe original name is \"one of our dinosaurs is missing\". This is the name of an old British movie.\n(A) \"ofne of our dinosaurs is missing\": Here \"one of\" is changed to \"ofne\", but the word \"ofne\" is not an actual word.\n(B) \"one af our dinosaurs is missing\": Here the word \"of\" is changed to \"af\", but the word \"af\" is not an actual word.\n(C) \"one of our dinosaurs is pissing\": Here the word \"missing\" is changed to \"pissing\", and \"one of our dinosaurs is pissing\" is indeed a very whimsical and mischievous edit. 
This change truly ruins the original title of the movie.\n(D) \"one of our dinosaur is missing\": Here the word \"dinosaurs\" is changed to \"dinosaur\", but \"dinosaur\" is singular but should be plural in the title; this change therefore feels arbitrary and not humorous.\nAbove the above, the only humorous edit is (C). So the answer is (C). Which of the following is a humorous edit of this artist or movie name: 'counting crows'?\nOptions:\n(A) countingy crows\n(B) counting cows\n(C) courting crows\n(D) coutnting crows\nA: Let's think step by step.\nThe original name is \"counting crows\". This is the name of an American rock band. Historically, the band name comes from the British nursery rhyme \"One for Sorrow\", which is about counting of magpies.\n(A) \"countingy crows\": Here the word \"counting\" is changed to \"countingy\", but the word \"countingy\" is not an actual word.\n(B) \"counting cows\": Here the word \"crows\" is changed to \"cows\", and this is indeed a playful and meaningful edit that ruins the original name of the band.\n(C) \"courting crows\": Here the word \"counting\" is changed to \"courting\", and \"courting\" is an actual word; however, \"courting crows\" does not sound as humorous as \"counting cows\".\n(D) \"coutnting crows\": Here the word \"counting\" is changed to \"coutnting\", but the word \"coutnting\" is not an actual word.\nAbove the above, the only humorous edit is (B). So the answer is (B).Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_fewshot_template_yaml" "task": "bbh_flan_cot_fewshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml index 2aa42072..1d4c13a6 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/salient_translation_error_detection.yaml @@ -1,5 +1,5 @@ "dataset_name": "salient_translation_error_detection" "description": "Detect the type of error in an English translation of a German source sentence.\n\n" "doc_to_text": " The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. 
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_salient_translation_error_detection"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml
index 0138faa7..060ff768 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/snarks.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "snarks"
 "description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n"
 "doc_to_text": " Which statement is sarcastic?\nOptions:\n(A) Yes, because having interests and actively researching them is a huge waste\n(B) Yes, because having interests and actively researching them is a huge deal\nA: Let's think step by step.\nIf we look at (A), it says that having interests and actively researching them is a huge waste, implying that it is a useless effort. However, we know that having interests and actively researching them is typically not a waste but rather is beneficial to the individual. The presence of such a juxtaposition in (A) suggests that it contains a taste of irony and sarcasm.\nIf we look at (B), it says that having interests and actively researching them is a huge deal, implying that it is an important and consequential effort. This is arguably a neutral and correct statement.\nAbove the above, the sarcastic option is (A). So the answer is (A). Which statement is sarcastic?\nOptions:\n(A) No one is going to disagree with you on this. Avoiding ad hominem attacks really help your case\n(B) No one is going to disagree with you on this. Ad hominem attacks really help your case\nA: Let's think step by step.\nIf we look at (A), it says that avoiding ad hominem attacks really help your case, implying that ad hominem attacks are adverse and injurious. Because ad hominem attacks are adressed at a person rather than an idea, it is indeed true that avoiding them is often useful and helpful; so, (A) is a neutral (valid and agreeable) statement.\nIf we look at (B), it says that ad hominem attacks really help your case, implying that ad hominem attacks are a positive thing. However, we stated previously that ad hominem attacks are often not useful or constructive. The speaker in this sentence therefore seems to mean the opposite of what they are saying; so, there appears to have a taste of irony and sarcasm in (B).\nAbove the above, the sarcastic option is (B). So the answer is (B). Which statement is sarcastic?\nOptions:\n(A) Consistency in the league's punishments? What do you think this is supposed to be, politics?\n(B) Consistency in the league's punishments? What do you think this is supposed to be, moral?\nA: Let's think step by step.\nIf we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.\nIf we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.\nAbove the above, the sarcastic option is (A). So the answer is (A).Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_snarks"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml
index 90c0f191..6f561df9 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/sports_understanding.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "sports_understanding"
 "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n"
 "doc_to_text": " Is the following sentence plausible? \"Bam Adebayo scored a reverse layup in the Western Conference Finals.\"\nA: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes. Is the following sentence plausible? \"Santi Cazorla scored a touchdown.\"\nA: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part of American football and rugby. So the answer is no. Is the following sentence plausible? \"DeMar DeRozan was called for the goal tend.\"\nA: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_sports_understanding"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml
index cff56746..2c55788f 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/temporal_sequences.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "temporal_sequences"
 "description": "Task description: Answer questions about which times certain events could have occurred.\n\n"
 "doc_to_text": " Today, Emily went to the museum. Between what times could they have gone?\nWe know that:\nEmily woke up at 1pm.\nElizabeth saw Emily reading at the library from 2pm to 4pm.\nJessica saw Emily watching a movie at the theater from 4pm to 5pm.\nLeslie saw Emily waiting at the airport from 5pm to 6pm.\nWilliam saw Emily buying clothes at the mall from 6pm to 7pm.\nThe museum was closed after 7pm.\nBetween what times could Emily have gone to the museum?\nOptions:\n(A) 1pm to 2pm\n(B) 6pm to 7pm\n(C) 5pm to 6pm\n(D) 2pm to 4pm\nA: Let's think step by step.\nWake-up time: 1pm.\n1pm-2pm: free.\n2pm-4pm: reading at the library.\n4pm-5pm: watching a movie at the theater.\n5pm-6pm: waiting at the airport.\n6pm-7pm: buying clothes at the mall.\nThe museum closure time: 7pm.\nThe only time when Emily could have gone to the museum was 1pm to 2pm. So the answer is (A). Today, Elizabeth went to the amusement park. Between what times could they have gone?\nWe know that:\nElizabeth woke up at 7am.\nDavid saw Elizabeth fixing their computer at the electronic store from 1pm to 2pm.\nSarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm.\nSusan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm.\nAndrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm.\nEmily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm.\nThe amusement park was closed after 10pm.\nBetween what times could Elizabeth have gone to the amusement park?\nOptions:\n(A) 7am to 1pm\n(B) 9pm to 10pm\n(C) 1pm to 2pm\n(D) 3pm to 6pm\nA: Let's think step by step.\nWake-up time: 7am.\n7am-1pm: free.\n1pm-2pm: fixing their computer at the electronic store.\n2pm-3pm: playing tennis at the tennis court.\n3pm-6pm: walking towards the Statue of Liberty.\n6pm-9pm: taking photos near the Eiffel Tower.\n9pm-10pm: getting a coffee at the cafe.\nThe amusement park closure time: 10pm.\nThe only time when Elizabeth could have gone to the amusement park was 7am to 1pm. So the answer is (A). Today, Tiffany went to the beach. Between what times could they have gone?\nWe know that:\nTiffany woke up at 5am.\nBetty saw Tiffany getting a coffee at the cafe from 5am to 6am.\nJessica saw Tiffany working at the office from 6am to 9am.\nJohn saw Tiffany stretching at a yoga studio from 9am to 12pm.\nSean saw Tiffany sitting on a rooftop from 12pm to 2pm.\nSarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm.\nThe beach was closed after 4pm.\nBetween what times could Tiffany have gone to the beach?\nOptions:\n(A) 9am to 12pm\n(B) 12pm to 2pm\n(C) 5am to 6am\n(D) 3pm to 4pm\nA: Let's think step by step.\nWake-up time: 5am.\n5am-6am: getting a coffee at the cafe.\n6am-9am: working at the office.\n9am-12pm: stretching at a yoga studio.\n12pm-2pm: sitting on a rooftop.\n2pm-3pm: playing tennis at the tennis court.\n3pm-4pm: free.\nThe beach closure time: 4pm.\nThe only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_temporal_sequences"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml
index a4fd3995..415ad429 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_five_objects.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "tracking_shuffled_objects_five_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": " Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: yellow, Bob: blue, Claire: pink.\n(1) Claire and Alice swap balls: Alice: pink, Bob: blue, Claire: yellow.\n(2) Alice and Bob swap balls: Alice: blue, Bob: pink, Claire: yellow.\n(3) Claire and Bob swap balls: Alice: blue, Bob: yellow, Claire: pink.\nAt the end of the game, Bob has the yellow ball. So the answer is (A). Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: white, Bob: purple, Claire: pink.\n(1) Bob and Alice swap balls: Alice: purple, Bob: white, Claire: pink.\n(2) Bob and Claire swap balls: Alice: purple, Bob: pink, Claire: white.\n(3) Bob and Alice swap balls: Alice: pink, Bob: purple, Claire: white.\nAt the end of the game, Alice has the pink ball. So the answer is (C). Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: Let's think step by step.\n(0) At the start: Alice: Lola, Bob: Rodrigo, Claire: Patrick.\n(1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.\n(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.\n(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.\nAt the end of the dance, Alice is dancing with Patrick. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_tracking_shuffled_objects_five_objects"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml
index 28ff5389..fbcffccf 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_seven_objects.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "tracking_shuffled_objects_seven_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": " Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: yellow, Bob: blue, Claire: pink.\n(1) Claire and Alice swap balls: Alice: pink, Bob: blue, Claire: yellow.\n(2) Alice and Bob swap balls: Alice: blue, Bob: pink, Claire: yellow.\n(3) Claire and Bob swap balls: Alice: blue, Bob: yellow, Claire: pink.\nAt the end of the game, Bob has the yellow ball. So the answer is (A). Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: white, Bob: purple, Claire: pink.\n(1) Bob and Alice swap balls: Alice: purple, Bob: white, Claire: pink.\n(2) Bob and Claire swap balls: Alice: purple, Bob: pink, Claire: white.\n(3) Bob and Alice swap balls: Alice: pink, Bob: purple, Claire: white.\nAt the end of the game, Alice has the pink ball. So the answer is (C). Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: Let's think step by step.\n(0) At the start: Alice: Lola, Bob: Rodrigo, Claire: Patrick.\n(1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.\n(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.\n(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.\nAt the end of the dance, Alice is dancing with Patrick. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_tracking_shuffled_objects_seven_objects"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml
index 84526656..320b31f2 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/tracking_shuffled_objects_three_objects.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "tracking_shuffled_objects_three_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": " Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: yellow, Bob: blue, Claire: pink.\n(1) Claire and Alice swap balls: Alice: pink, Bob: blue, Claire: yellow.\n(2) Alice and Bob swap balls: Alice: blue, Bob: pink, Claire: yellow.\n(3) Claire and Bob swap balls: Alice: blue, Bob: yellow, Claire: pink.\nAt the end of the game, Bob has the yellow ball. So the answer is (A). Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: Let's think step by step.\n(0) At the start: Alice: white, Bob: purple, Claire: pink.\n(1) Bob and Alice swap balls: Alice: purple, Bob: white, Claire: pink.\n(2) Bob and Claire swap balls: Alice: purple, Bob: pink, Claire: white.\n(3) Bob and Alice swap balls: Alice: pink, Bob: purple, Claire: white.\nAt the end of the game, Alice has the pink ball. So the answer is (C). Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: Let's think step by step.\n(0) At the start: Alice: Lola, Bob: Rodrigo, Claire: Patrick.\n(1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick.\n(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.\n(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.\nAt the end of the dance, Alice is dancing with Patrick. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_tracking_shuffled_objects_three_objects"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml
index 69857561..755ce9f9 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/web_of_lies.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "web_of_lies"
 "description": "Evaluate a random boolean function expressed as a word problem.\n\n"
 "doc_to_text": " Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?\nA: Let's think step by step.\n(1) Fidel tells the truth. So, we know that Fidel tells the truth.\n(2) Jerry says Fidel tells the truth. Since we know from (1) that Fidel tells the truth, if Jerry says that Fidel tells the truth, then Jerry tells the truth.\n(3) Vina says Jerry tells the truth. Since we know from (2) that Jerry tells the truth, if Vina says Jerry tells the truth, then Vine tells the truth.\n(4) Millicent says Vina lies. Since we know from (3) that Vina tells the truth, if Millicent says Vina lies, then Millicent lies.\n(5) Raymond says Millicent lies. Since we know from (4) that Millicent lies, if Raymond says Millicent lies, then Raymond tells the truth.\nNow, the question asks: Does Raymond tell the truth? We know from (5) that Raymond tells the truth. So the answer is Yes. Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell the truth?\nA: Let's think step by step.\n(1) Kristian lies. So, we know that Kristian lies.\n(2) Millie says Kristian lies. Since we know from (1) that Kristian lies, if Millie says Kristian lies, then Millie tells the truth.\n(3) Maybelle says Millie tells the truth. Since we know from (2) that Millie tells the truth, if Maybelle says Millie tells the truth, then Maybelle tells the truth.\n(4) Fidel says Maybelle lies. Since we know from (3) that Maybelle tells the truth, if Fidel says Maybelle lies, then Fidel lies.\n(5) Leda says Fidel lies. Since we know from (4) that Fidel lies, if Leda says Fidel lies, then Leda tells the truth.\nNow, the question asks: Does Leda tell the truth? We know from (5) that Leda tells the truth. So the answer is Yes. Question: Kristian tells the truth. Michaela says Kristian lies. Raymond says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says Osvaldo tells the truth. Does Jamey tell the truth?\nA: Let's think step by step.\n(1) Kristian tells the truth. So, we know that Kristian tells the truth.\n(2) Michaela says Kristian lies. Since we know from (1) that Kristian tells the truth, if Michaela says Kristian lies, then Michaela lies.\n(3) Raymond says Michaela tells the truth. Since we know from (2) that Michaela lies, if Raymond says Michaela tells the truth, then Raymond lies.\n(4) Osvaldo says Raymond tells the truth. Since we know from (3) that Raymond lies, if Osvaldo says Raymond tells the truth, then Osvaldo lies.\n(5) Jamey says Osvaldo tells the truth. Since we know from (4) that Osvaldo lies, if Jamey says Osvaldo tells the truth, then Jamey lies.\nNow, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No.Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_web_of_lies"
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml
index 0e2e1a88..d6b9f1bc 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/word_sorting.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "word_sorting"
 "description": "Sort a list of words.\n\n"
 "doc_to_text": " Sort the following words alphabetically: List: oven costume counterpart\nA: Let's think step by step.\nThe first letter: \"oven\": \"o\" (15). \"costume\": \"c\" (3). \"counterpart\": \"c\" (3). We now have: (3) [\"costume\" ? \"counterpart\"] < (15) \"oven\". Now let's sort this subpart [\"costume\" ? \"counterpart\"] by looking at their second letters.\nThe second letter: \"costume\": \"o\" (15). \"counterpart\": \"o\" (15). We now have: (15) [\"costume\" ? \"counterpart\"]. Now let's sort this subpart [\"costume\" ? \"counterpart\"] by looking at their third letters.\nThe third letter: \"costume\": \"s\" (19). \"counterpart\": \"u\" (21). We now have: (19) \"costume\" < (21) \"counterpart\". Hence, we have [\"costume\" < \"counterpart\"] < \"oven\". So the answer is costume counterpart oven. Sort the following words alphabetically: List: hypochlorite ponderosa phone credulity\nA: Let's think step by step.\nThe first letter: \"hypochlorite\": \"h\" (8). \"ponderosa\": \"p\" (16). \"phone\": \"p\" (16). \"credulity\": \"c\" (3). We now have: (3) \"credulity\" < (8) \"hypochlorite\" < (16) [\"ponderosa\" ? \"phone\"]. Now let's sort this subpart [\"ponderosa\" ? \"phone\"] by looking at their second letters.\nThe second letter: \"ponderosa\": \"o\" (15). \"phone\": \"h\" (8). We now have: (8) \"phone\" < (15) \"ponderosa\". Hence, we have \"credulity\" < \"hypochlorite\" < [\"phone\" <\"ponderosa\"]. So the answer is credulity hypochlorite phone ponderosa. Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census\nA: Let's think step by step.\nThe first letter: \"newt\": \"n\" (14). \"arson\": \"a\" (1). \"parthia\": \"p\" (16). \"seismography\": \"s\" (19). \"mugho\": \"m\" (13). \"aspect\": \"a\" (1). \"census\": \"c\" (3). We now have: (1) [\"arson\" ? \"aspect\"] < (3) \"census\" < (13) \"mugho\" < (14) \"newt\" < (16) \"parthia\" < (19) \"seismography\". Now let's sort this subpart [\"arson\" ? \"aspect\"] by looking at their second letters.\nThe second letter: \"arson\": \"r\" (18). \"aspect\": \"s\" (19). We now have: (18) \"arson\" < (19) \"aspect\". Hence, we have [\"arson\" < \"aspect\"] < \"census\" < \"mugho\" < \"newt\" < \"parthia\" < \"seismography\". So the answer is arson aspect census mugho newt parthia seismography.Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_template_yaml"
+"include": "_flan_cot_fewshot_template_yaml"
 "task": "bbh_flan_cot_fewshot_word_sorting"
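The renamed include target for this group does not appear anywhere in this patch. For orientation, a shared base file along the following lines is the kind of thing the per-task configs above would pull in; every field here is an assumption for illustration (in particular the dataset path, target field, stop sequences, and metric), not a value taken from this diff:

    # _flan_cot_fewshot_template_yaml -- illustrative sketch only
    dataset_path: lukaemon/bbh     # assumed Hugging Face dataset with input/target columns
    output_type: generate_until    # free-form generation, since answers are chain-of-thought
    test_split: test
    doc_to_target: "{{target}}"    # assumed answer field
    generation_kwargs:
      until:                       # assumed stop sequences, cutting off a trailing "Q:" turn
        - "Q:"
        - "\n\n"
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true

Under this reading, each generated subtask YAML only needs to override dataset_name, description, doc_to_text, and task, which is exactly what the files above do.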
"formal_fallacies" "description": "Distinguish deductively valid arguments from formal fallacies.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_formal_fallacies" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml index acb91aa4..6de27bce 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/geometric_shapes.yaml @@ -1,5 +1,5 @@ "dataset_name": "geometric_shapes" "description": "Name geometric shapes from their SVG paths.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_geometric_shapes" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml index dbe1280b..3fd6d246 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/hyperbaton.yaml @@ -1,5 +1,5 @@ "dataset_name": "hyperbaton" "description": "Order adjectives correctly in English sentences.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_hyperbaton" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml index 5592252a..6ab7c57e 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_five_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_five_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_logical_deduction_five_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml index c85b9d21..5c7e22df 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_seven_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_logical_deduction_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml index e94f8e4a..6337dfd1 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_three_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" 
"task": "bbh_flan_cot_zeroshot_logical_deduction_three_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml index 038119fc..0bcb95c9 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml @@ -1,5 +1,5 @@ "dataset_name": "movie_recommendation" "description": "Recommend movies similar to the given list of movies.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_movie_recommendation" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml index c21c1b8c..7cb710ea 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml @@ -1,5 +1,5 @@ "dataset_name": "multistep_arithmetic_two" "description": "Solve multi-step arithmetic problems.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_multistep_arithmetic_two" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml index c8ea0681..d534c729 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml @@ -1,5 +1,5 @@ "dataset_name": "navigate" "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml index 6d6a4721..3457954a 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml @@ -1,5 +1,5 @@ "dataset_name": "object_counting" "description": "Questions that involve enumerating objects and asking the model to count them.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml index c5501700..fa03b5b7 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml @@ -1,5 +1,5 @@ "dataset_name": "penguins_in_a_table" "description": "Answer questions about a table of penguins and their attributes.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml index 26789385..7cc6d7f6 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml @@ -1,5 
+1,5 @@ "dataset_name": "reasoning_about_colored_objects" "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml index 3289b750..17d4c32d 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml @@ -1,5 +1,5 @@ "dataset_name": "ruin_names" "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml index c8113e62..58f6b0c2 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml @@ -1,5 +1,5 @@ "dataset_name": "salient_translation_error_detection" "description": "Detect the type of error in an English translation of a German source sentence.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_salient_translation_error_detection" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml index b9da41c7..b31a8f94 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml @@ -1,5 +1,5 @@ "dataset_name": "snarks" "description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_snarks" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml index dbf21164..0a44ba92 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml @@ -1,5 +1,5 @@ "dataset_name": "sports_understanding" "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_sports_understanding" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml index 84db7993..573f5e43 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml @@ -1,5 +1,5 @@ "dataset_name": 
"temporal_sequences" "description": "Task description: Answer questions about which times certain events could have occurred.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_temporal_sequences" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml index 4b6ec1ad..e1b7d1c2 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_five_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_five_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml index 99dbcc33..17e5dcbc 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_seven_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml index 4f9ff8e7..8c3d7c4d 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_three_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_three_objects" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml index 5304cdfc..adec5f3c 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml @@ -1,5 +1,5 @@ "dataset_name": "web_of_lies" "description": "Evaluate a random boolean function expressed as a word problem.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_web_of_lies" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml index 62f0a6aa..50276209 100644 --- 
a/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml @@ -1,5 +1,5 @@ "dataset_name": "word_sorting" "description": "Sort a list of words.\n\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" -"include": "_template_yaml" +"include": "_flan_cot_zeroshot_template_yaml" "task": "bbh_flan_cot_zeroshot_word_sorting" diff --git a/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml index 19d24f3f..35436bd7 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml @@ -1,5 +1,5 @@ "dataset_name": "boolean_expressions" "description": "Evaluate the result of a random Boolean expression.\n\n" "doc_to_text": "Q: not ( ( not not True ) ) is\nA: False\n\nQ: True and False and not True and True is\nA: False\n\nQ: not not ( not ( False ) ) is\nA: True\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_boolean_expressions" diff --git a/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml index b9dd8f6e..5857204c 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml @@ -1,5 +1,5 @@ "dataset_name": "causal_judgement" "description": "Answer questions about causal attribution.\n\n" "doc_to_text": "Q: How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: No\n\nQ: How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. 
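With each prompting variant pointing at its own template, the three styles of a BBH subtask become independently addressable by task name. A sketch of selecting two of them through the evaluator API follows; the backend name and checkpoint are placeholders, and the exact signature of simple_evaluate may differ on this branch:

    # illustrative sketch only; assumes lm_eval.simple_evaluate as exported by this repo
    import lm_eval

    results = lm_eval.simple_evaluate(
        model="hf",  # placeholder backend name
        model_args="pretrained=EleutherAI/pythia-160m",  # placeholder checkpoint
        tasks=[
            "bbh_flan_cot_fewshot_word_sorting",   # 3-shot chain-of-thought prompts
            "bbh_flan_cot_zeroshot_word_sorting",  # zero-shot "Let's think step by step."
        ],
    )
    print(results["results"])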
diff --git a/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml
index 19d24f3f..35436bd7 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "boolean_expressions"
 "description": "Evaluate the result of a random Boolean expression.\n\n"
 "doc_to_text": "Q: not ( ( not not True ) ) is\nA: False\n\nQ: True and False and not True and True is\nA: False\n\nQ: not not ( not ( False ) ) is\nA: True\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_boolean_expressions"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml
index b9dd8f6e..5857204c 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "causal_judgement"
 "description": "Answer questions about causal attribution.\n\n"
 "doc_to_text": "Q: How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: No\n\nQ: How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_causal_judgement"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml b/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml
index 5ed01c22..8b0937cf 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "date_understanding"
 "description": "Infer the date from context.\n\n"
 "doc_to_text": "Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\nOptions:\n(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\nA: (D)\n\nQ: Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\nOptions:\n(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n(D) 11/02/2018\n(E) 11/04/2018\nA: (B)\n\nQ: Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\nA: (B)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_date_understanding"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml b/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml
index 0c04056f..bfa3f5d1 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "disambiguation_qa"
 "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
 "doc_to_text": "Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The chief told the counselor that they took the day off.\nOptions:\n(A) The chief took the day off\n(B) The counselor took the day off\n(C) Ambiguous\nA: (A)\n\nQ: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The manager sent a message to the secretary, but he didn't reply yet.\nOptions:\n(A) The secretary didn't reply yet\n(B) The manager didn't reply yet\n(C) Ambiguous\nA: (A)\n\nQ: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: Bailey will plan to meet the director at his office\nOptions:\n(A) It will be Bailey's office\n(B) It will be the director's office\n(C) Ambiguous\nA: (C)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_disambiguation_qa"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml b/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml
index 84e308cf..2bfca6d2 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "dyck_languages"
 "description": "Correctly close a Dyck-n word.\n\n"
 "doc_to_text": "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [\nA: ] } ]\n\nQ: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]\nA: ] ) )\n\nQ: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }\nA: } ] >\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_dyck_languages"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml b/lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml
index c91769a5..a420a34c 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/formal_fallacies.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "formal_fallacies"
 "description": "Distinguish deductively valid arguments from formal fallacies.\n\n"
 "doc_to_text": "Q: \"It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: valid\n\nQ: \"It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: Whoever is not a great-grandfather of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient for not being a great-grandfather of Clyde. We may conclude: Everyone who is an ancestor of Dana is a stepbrother of Brian, too.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: valid\n\nQ: \"It is not always easy to grasp who is consuming which products. The following argument pertains to this question: Every infrequent user of Paul Mitchell shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo and, in the same time, a loyal buyer of Caress soap. It follows that whoever is an infrequent user of Paul Mitchell shampoo is not a regular consumer of Lush soap.\"\nIs the argument, given the explicitly stated premises, deductively valid or invalid?\nOptions:\n- valid\n- invalid\nA: invalid\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_formal_fallacies"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml
index 1dbb242f..75a72a11 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/geometric_shapes.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "geometric_shapes"
 "description": "Name geometric shapes from their SVG paths.\n\n"
 "doc_to_text": "Q: This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: (F)\n\nQ: This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: (G)\n\nQ: This SVG path element draws a\nOptions:\n(A) circle\n(B) heptagon\n(C) hexagon\n(D) kite\n(E) line\n(F) octagon\n(G) pentagon\n(H) rectangle\n(I) sector\n(J) triangle\nA: (D)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_geometric_shapes"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml
index 090865b2..d1007dd0 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/hyperbaton.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "hyperbaton"
 "description": "Order adjectives correctly in English sentences.\n\n"
 "doc_to_text": "Q: Which sentence has the correct adjective order:\nOptions:\n(A) rubber terrible ship\n(B) terrible rubber ship\nA: (B)\n\nQ: Which sentence has the correct adjective order:\nOptions:\n(A) repulsive small Brazilian exercise ship\n(B) Brazilian repulsive exercise small ship\nA: (A)\n\nQ: Which sentence has the correct adjective order:\nOptions:\n(A) blue gold wonderful square shoe\n(B) wonderful square blue gold shoe\nA: (B)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_hyperbaton"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml
index 67f2c1a7..68318d84 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_five_objects.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "logical_deduction_five_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
 "doc_to_text": "Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: (B)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: (A)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: (C)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_logical_deduction_five_objects"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml
index 47593a0d..e6d2a205 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_seven_objects.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "logical_deduction_seven_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
 "doc_to_text": "Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: (B)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: (A)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: (C)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_logical_deduction_seven_objects"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml
index 7264e653..ebed792e 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/logical_deduction_three_objects.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "logical_deduction_three_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
 "doc_to_text": "Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.\nOptions:\n(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\nA: (B)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a white book, a green book, and an orange book. The green book is to the right of the white book. The orange book is the rightmost.\nOptions:\n(A) The white book is the leftmost\n(B) The green book is the leftmost\n(C) The orange book is the leftmost\nA: (A)\n\nQ: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a shelf, there are three books: a red book, a gray book, and a white book. The white book is to the left of the gray book. The red book is the second from the left.\nOptions:\n(A) The red book is the leftmost\n(B) The gray book is the leftmost\n(C) The white book is the leftmost\nA: (C)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_logical_deduction_three_objects"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml
index 8fb208b0..9db0e3e6 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/movie_recommendation.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "movie_recommendation"
 "description": "Recommend movies similar to the given list of movies.\n\n"
 "doc_to_text": "Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:\nOptions:\n(A) Tetsuo\n(B) the Ironman\n(C) The Princess Bride\n(D) The Barkley Marathons The Race That Eats Its Young\n(E) Bug\nA: (C)\n\nQ: Find a movie similar to Twister, The Silence of the Lambs, Independence Day, Braveheart:\nOptions:\n(A) They Shoot Horses\n(B) Don't They\n(C) Forrest Gump\n(D) The Salton Sea\n(E) Extreme Days\nA: (C)\n\nQ: Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest Gump:\nOptions:\n(A) Phenomena\n(B) Lilting\n(C) Catwoman\n(D) Edge of Tomorrow\nA: (D)\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_movie_recommendation"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml
index ba5f65ca..36c3adbd 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/multistep_arithmetic_two.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "multistep_arithmetic_two"
 "description": "Solve multi-step arithmetic problems.\n\n"
 "doc_to_text": "Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =\nA: 123\n\nQ: ((-9 * 7 * 7 * -9) + (4 * -9 - 8 - -4)) =\nA: 3929\n\nQ: ((-3 + 5 * 8 * -4) - (9 - 8 * -7 + -9)) =\nA: -219\n\nQ: {{input}}\nA:"
-"include": "_template_yaml"
+"include": "_flan_fewshot_template_yaml"
 "task": "bbh_flan_fewshot_multistep_arithmetic_two"
diff --git a/lm_eval/tasks/bbh/flan_fewshot/navigate.yaml b/lm_eval/tasks/bbh/flan_fewshot/navigate.yaml
index 0aba5820..e896d245 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/navigate.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/navigate.yaml
@@ -1,5 +1,5 @@
 "dataset_name": "navigate"
 "description": "Given a series of navigation instructions, determine whether one would end up back at the
starting point.\n\n" "doc_to_text": "Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\nOptions:\n- Yes\n- No\nA: No\n\nQ: If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.\nOptions:\n- Yes\n- No\nA: No\n\nQ: If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml b/lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml index 7aa27a38..ddfc07e0 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/object_counting.yaml @@ -1,5 +1,5 @@ "dataset_name": "object_counting" "description": "Questions that involve enumerating objects and asking the model to count them.\n\n" "doc_to_text": "Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?\nA: 6\n\nQ: I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape, a nectarine, and three plums. How many fruits do I have?\nA: 11\n\nQ: I have a lettuce head, a head of broccoli, an onion, a stalk of celery, two carrots, a garlic, and a yam. How many vegetables do I have?\nA: 8\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml index f91d9c98..7c7087d1 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/penguins_in_a_table.yaml @@ -1,5 +1,5 @@ "dataset_name": "penguins_in_a_table" "description": "Answer questions about a table of penguins and their attributes.\n\n" "doc_to_text": "Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:\nJames, 12, 90, 12\nHow many penguins are less than 8 years old?\nOptions:\n(A) 1\n(B) 2\n(C) 3\n(D) 4\n(E) 5\nA: (B)\n\nQ: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. Which is the youngest penguin?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: (B)\n\nQ: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. 
What is the name of the second penguin sorted by alphabetic order?\nOptions:\n(A) Louis\n(B) Bernard\n(C) Vincent\n(D) Gwen\n(E) James\nA: (D)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml index d03dcd07..02422c1b 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/reasoning_about_colored_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "reasoning_about_colored_objects" "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" "doc_to_text": "Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: (E)\n\nQ: On the table, you see a bunch of objects arranged in a row: a purple paperclip, a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve fidget spinner, and a burgundy pen. What is the color of the object directly to the right of the stress ball?\nOptions:\n(A) red\n(B) orange\n(C) yellow\n(D) green\n(E) blue\n(F) brown\n(G) magenta\n(H) fuchsia\n(I) mauve\n(J) teal\n(K) turquoise\n(L) burgundy\n(M) silver\n(N) gold\n(O) black\n(P) grey\n(Q) purple\n(R) pink\nA: (F)\n\nQ: On the nightstand, you see the following items arranged in a row: a teal plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, a pink notebook, and a grey cup. 
How many non-orange items do you see to the left of the teal item?\nOptions:\n(A) zero\n(B) one\n(C) two\n(D) three\n(E) four\n(F) five\n(G) six\nA: (A)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml index d12013db..b9c9b7b5 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/ruin_names.yaml @@ -1,5 +1,5 @@ "dataset_name": "ruin_names" "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" "doc_to_text": "Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?\nOptions:\n(A) whitesnape\n(B) whitesnapke\n(C) whitesnuake\n(D) mwhitesnake\nA: (A)\n\nQ: Which of the following is a humorous edit of this artist or movie name: 'one of our dinosaurs is missing'?\nOptions:\n(A) ofne of our dinosaurs is missing\n(B) one af our dinosaurs is missing\n(C) one of our dinosaurs is pissing\n(D) one of our dinosaur is missing\nA: (C)\n\nQ: Which of the following is a humorous edit of this artist or movie name: 'counting crows'?\nOptions:\n(A) countingy crows\n(B) counting cows\n(C) courting crows\n(D) coutnting crows\nA: (B)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml index bfbcfa35..81be1ce6 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/salient_translation_error_detection.yaml @@ -1,5 +1,5 @@ "dataset_name": "salient_translation_error_detection" "description": "Detect the type of error in an English translation of a German source sentence.\n\n" "doc_to_text": "Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.\nTranslation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: (D)\n\nQ: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. 
Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: Auf dieser Seite sind die Baudenkmäler der oberbayerischen Großen Kreisstadt Landsberg am Lech zusammengestellt.\nTranslation: On this page are compiled the architectural monuments of the town of Landsberg am Lech.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: (E)\n\nQ: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: Łeba ist eine Kleinstadt und ein Badeort im Powiat Lęborski der polnischen Woiwodschaft Pommern.\nTranslation: Eba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts\nA: (C)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_salient_translation_error_detection" diff --git a/lm_eval/tasks/bbh/flan_fewshot/snarks.yaml b/lm_eval/tasks/bbh/flan_fewshot/snarks.yaml index 375da1a2..803ead1a 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/snarks.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/snarks.yaml @@ -1,5 +1,5 @@ "dataset_name": "snarks" "description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n" "doc_to_text": "Q: Which statement is sarcastic?\nOptions:\n(A) Yes, because having interests and actively researching them is a huge waste\n(B) Yes, because having interests and actively researching them is a huge deal\nA: (A)\n\nQ: Which statement is sarcastic?\nOptions:\n(A) No one is going to disagree with you on this. Avoiding ad hominem attacks really help your case\n(B) No one is going to disagree with you on this. Ad hominem attacks really help your case\nA: (B)\n\nQ: Which statement is sarcastic?\nOptions:\n(A) Consistency in the league's punishments? What do you think this is supposed to be, politics?\n(B) Consistency in the league's punishments? 
What do you think this is supposed to be, moral?\nA: (A)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_snarks" diff --git a/lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml index f72cdba9..25a321f8 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/sports_understanding.yaml @@ -1,5 +1,5 @@ "dataset_name": "sports_understanding" "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" "doc_to_text": "Q: Is the following sentence plausible? \"Bam Adebayo scored a reverse layup in the Western Conference Finals.\"\nA: yes\n\nQ: Is the following sentence plausible? \"Santi Cazorla scored a touchdown.\"\nA: no\n\nQ: Is the following sentence plausible? \"DeMar DeRozan was called for the goal tend.\"\nA: yes\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_sports_understanding" diff --git a/lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml index 0f5e5380..2cf283fe 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/temporal_sequences.yaml @@ -1,5 +1,5 @@ "dataset_name": "temporal_sequences" "description": "Task description: Answer questions about which times certain events could have occurred.\n\n" "doc_to_text": "Q: Today, Emily went to the museum. Between what times could they have gone?\nWe know that:\nEmily woke up at 1pm.\nElizabeth saw Emily reading at the library from 2pm to 4pm.\nJessica saw Emily watching a movie at the theater from 4pm to 5pm.\nLeslie saw Emily waiting at the airport from 5pm to 6pm.\nWilliam saw Emily buying clothes at the mall from 6pm to 7pm.\nThe museum was closed after 7pm.\nBetween what times could Emily have gone to the museum?\nOptions:\n(A) 1pm to 2pm\n(B) 6pm to 7pm\n(C) 5pm to 6pm\n(D) 2pm to 4pm\nA: (A)\n\nQ: Today, Elizabeth went to the amusement park. Between what times could they have gone?\nWe know that:\nElizabeth woke up at 7am.\nDavid saw Elizabeth fixing their computer at the electronic store from 1pm to 2pm.\nSarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm.\nSusan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm.\nAndrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm.\nEmily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm.\nThe amusement park was closed after 10pm.\nBetween what times could Elizabeth have gone to the amusement park?\nOptions:\n(A) 7am to 1pm\n(B) 9pm to 10pm\n(C) 1pm to 2pm\n(D) 3pm to 6pm\nA: (A)\n\nQ: Today, Tiffany went to the beach. 
Between what times could they have gone?\nWe know that:\nTiffany woke up at 5am.\nBetty saw Tiffany getting a coffee at the cafe from 5am to 6am.\nJessica saw Tiffany working at the office from 6am to 9am.\nJohn saw Tiffany stretching at a yoga studio from 9am to 12pm.\nSean saw Tiffany sitting on a rooftop from 12pm to 2pm.\nSarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm.\nThe beach was closed after 4pm.\nBetween what times could Tiffany have gone to the beach?\nOptions:\n(A) 9am to 12pm\n(B) 12pm to 2pm\n(C) 5am to 6am\n(D) 3pm to 4pm\nA: (D)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_temporal_sequences" diff --git a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml index 112ede19..f8b085dc 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_five_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_five_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: (A)\n\nQ: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: (C)\n\nQ: Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: (C)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_tracking_shuffled_objects_five_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml index 83821c54..3c2f3ca4 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_seven_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_seven_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: Alice, Bob, and Claire are playing a game. 
At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: (A)\n\nQ: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: (C)\n\nQ: Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: (C)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_tracking_shuffled_objects_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml index afee9bb9..ef406244 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/tracking_shuffled_objects_three_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_three_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the\nOptions:\n(A) yellow ball\n(B) blue ball\n(C) pink ball\nA: (A)\n\nQ: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.\nAs the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the\nOptions:\n(A) white ball\n(B) purple ball\n(C) pink ball\nA: (C)\n\nQ: Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. 
At the end of the dance, Alice is dancing with\nOptions:\n(A) Lola\n(B) Rodrigo\n(C) Patrick\nA: (C)\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_tracking_shuffled_objects_three_objects" diff --git a/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml index 2f1c5686..23427364 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml @@ -1,5 +1,5 @@ "dataset_name": "web_of_lies" "description": "Evaluate a random boolean function expressed as a word problem.\n\n" "doc_to_text": "Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?\nA: Yes\n\nQ: Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell the truth?\nA: Yes\n\nQ: Question: Kristian tells the truth. Michaela says Kristian lies. Raymond says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says Osvaldo tells the truth. Does Jamey tell the truth?\nA: No\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_web_of_lies" diff --git a/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml index 43b7d43d..9ef1a093 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml @@ -1,5 +1,5 @@ "dataset_name": "word_sorting" "description": "Sort a list of words.\n\n" "doc_to_text": "Q: Sort the following words alphabetically: List: oven costume counterpart\nA: costume counterpart oven\n\nQ: Sort the following words alphabetically: List: hypochlorite ponderosa phone credulity\nA: credulity hypochlorite phone ponderosa\n\nQ: Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census\nA: arson aspect census mugho newt parthia seismography\n\nQ: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_fewshot_template_yaml" "task": "bbh_flan_fewshot_word_sorting" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml b/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml index 7098c7e3..5cfef9fe 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml @@ -1,5 +1,5 @@ "dataset_name": "boolean_expressions" "description": "Evaluate the result of a random Boolean expression.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_boolean_expressions" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml b/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml index 953419bd..a09d701f 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml @@ -1,5 +1,5 @@ "dataset_name": "causal_judgement" "description": "Answer questions about causal attribution.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_causal_judgement" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml b/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml index 99255c90..77142f4a 100644 --- 
a/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml @@ -1,5 +1,5 @@ "dataset_name": "date_understanding" "description": "Infer the date from context.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_date_understanding" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml b/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml index 65c515cc..8b19c2b2 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml @@ -1,5 +1,5 @@ "dataset_name": "disambiguation_qa" "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_disambiguation_qa" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml b/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml index 10b87a70..6ca7a8a3 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml @@ -1,5 +1,5 @@ "dataset_name": "dyck_languages" "description": "Correctly close a Dyck-n word.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_dyck_languages" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml b/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml index 7fcf6920..b5622c57 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml @@ -1,5 +1,5 @@ "dataset_name": "formal_fallacies" "description": "Distinguish deductively valid arguments from formal fallacies.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_formal_fallacies" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml b/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml index ee6082b9..abae12e6 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml @@ -1,5 +1,5 @@ "dataset_name": "geometric_shapes" "description": "Name geometric shapes from their SVG paths.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_geometric_shapes" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml b/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml index 3e82c854..8cb072b8 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml @@ -1,5 +1,5 @@ "dataset_name": "hyperbaton" "description": "Order adjectives correctly in English sentences.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_hyperbaton" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml index 1e4adeb4..e4a4dc9e 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_five_objects" "description": "A logical deduction task which requires deducing the order of a sequence 
of objects.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_logical_deduction_five_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml index 910ca139..bb96b78c 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_seven_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_logical_deduction_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml index 405cf023..2244b5b3 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "logical_deduction_three_objects" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_logical_deduction_three_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml b/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml index 54dc45f3..b735bb76 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml @@ -1,5 +1,5 @@ "dataset_name": "movie_recommendation" "description": "Recommend movies similar to the given list of movies.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_movie_recommendation" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml b/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml index 494b94fe..3db4c192 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml @@ -1,5 +1,5 @@ "dataset_name": "multistep_arithmetic_two" "description": "Solve multi-step arithmetic problems.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_multistep_arithmetic_two" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml b/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml index 3f107003..979b7269 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml @@ -1,5 +1,5 @@ "dataset_name": "navigate" "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_navigate" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml b/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml index 8e1a675e..131f3418 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml @@ -1,5 +1,5 @@ "dataset_name": 
"object_counting" "description": "Questions that involve enumerating objects and asking the model to count them.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_object_counting" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml b/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml index c3c4138c..234b8392 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml @@ -1,5 +1,5 @@ "dataset_name": "penguins_in_a_table" "description": "Answer questions about a table of penguins and their attributes.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_penguins_in_a_table" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml index bbe01119..73f5772d 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "reasoning_about_colored_objects" "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_reasoning_about_colored_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml b/lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml index b43e9414..96c1ab6e 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml @@ -1,5 +1,5 @@ "dataset_name": "ruin_names" "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_ruin_names" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml b/lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml index 43ebe9f0..fcc7bad3 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml @@ -1,5 +1,5 @@ "dataset_name": "salient_translation_error_detection" "description": "Detect the type of error in an English translation of a German source sentence.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_salient_translation_error_detection" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml b/lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml index df46e580..17fa9673 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml @@ -1,5 +1,5 @@ "dataset_name": "snarks" -"description": "Determine which of two sentences is sarcastic.\n\n" +"description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" 
+"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_snarks" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml b/lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml index fdbc3287..0c1edbd9 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml @@ -1,5 +1,5 @@ "dataset_name": "sports_understanding" "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_sports_understanding" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml b/lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml index 4a526778..c9cea8b0 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml @@ -1,5 +1,5 @@ "dataset_name": "temporal_sequences" "description": "Task description: Answer questions about which times certain events could have occurred.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_temporal_sequences" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml index 39d96c56..c28d374f 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_five_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_tracking_shuffled_objects_five_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml index c1f42e8f..ed47f69b 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_seven_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_tracking_shuffled_objects_seven_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml index 0e02323d..348fc541 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml @@ -1,5 +1,5 @@ "dataset_name": "tracking_shuffled_objects_three_objects" "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" 
"task": "bbh_flan_zeroshot_tracking_shuffled_objects_three_objects" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml b/lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml index 179aab6a..d886cd26 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml @@ -1,5 +1,5 @@ "dataset_name": "web_of_lies" "description": "Evaluate a random boolean function expressed as a word problem.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_web_of_lies" diff --git a/lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml b/lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml index 9317b875..db12d15b 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml @@ -1,5 +1,5 @@ "dataset_name": "word_sorting" "description": "Sort a list of words.\n\n" "doc_to_text": "Q: {{input}}\nA:" -"include": "_template_yaml" +"include": "_flan_zeroshot_template_yaml" "task": "bbh_flan_zeroshot_word_sorting" -- GitLab From ec03783fde292bd79c8f4df3c80e280756ee3994 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 16:07:19 +0000 Subject: [PATCH 036/212] update for held-in tasks --- .../flan/prompt_templates/flan_anli.yaml | 36 +++++++++---------- .../flan/prompt_templates/flan_arc.yaml | 15 ++++---- .../flan/yaml_templates/cot_template_yaml | 2 -- .../flan/yaml_templates/held_in_template_yaml | 2 -- lm_eval/benchmarks/flan_anli.yaml | 17 +++++++++ lm_eval/benchmarks/flan_boolq.yaml | 7 ++++ lm_eval/benchmarks/flan_held_in.yaml | 14 ++++---- lm_eval/benchmarks/flan_rte.yaml | 7 ++++ 8 files changed, 64 insertions(+), 36 deletions(-) create mode 100644 lm_eval/benchmarks/flan_anli.yaml create mode 100644 lm_eval/benchmarks/flan_boolq.yaml create mode 100644 lm_eval/benchmarks/flan_rte.yaml diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml index 9b9f6705..6ff78840 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml @@ -1,29 +1,29 @@ # Flan Prompt Templates prompts: "template-0": - doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nI think the answer is" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-1": - doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-2": - doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "{{premise}}\n\nCan we 
draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-3": - doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-4": - doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nThe answer is:" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-5": - doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-6": - doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-7": - doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "Can we draw the following hypothesis from the context (see options)? 
\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" "template-8": - doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No" - doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}" + doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" + doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" diff --git a/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml b/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml index c9135a51..4ee34e65 100644 --- a/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml +++ b/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml @@ -2,23 +2,22 @@ prompts: "template-0": doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}" + doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" "template-1": doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:" - doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}" + doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" "template-2": doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}" + doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" "template-3": doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:" - doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}" + doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" "template-4": doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}" + doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" "template-5": doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}" + doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" "template-6": doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}" - + doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" diff --git a/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml b/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml index 0cb0d16e..cbd40849 100644 --- a/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml +++ b/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml @@ -6,8 +6,6 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: true generation_kwargs: until: - "\n\n" diff --git a/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml b/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml index 2f4a4c84..e09daca2 100644 --- a/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml +++ 
b/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml @@ -4,8 +4,6 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true - ignore_case: true - ignore_punctuation: true generation_kwargs: until: - "" diff --git a/lm_eval/benchmarks/flan_anli.yaml b/lm_eval/benchmarks/flan_anli.yaml new file mode 100644 index 00000000..d6201bb7 --- /dev/null +++ b/lm_eval/benchmarks/flan_anli.yaml @@ -0,0 +1,17 @@ +group: flan_anli +task: + - include: flan/yaml_templates/held_in_template_yaml + task: anli_r1 + dataset_path: anli + use_prompt: flan/prompt_templates/flan_anli.yaml:* + validation_split: dev_r1 + - include: flan/yaml_templates/held_in_template_yaml + task: anli_r2 + dataset_path: anli + use_prompt: flan/prompt_templates/flan_anli.yaml:* + validation_split: dev_r2 + - include: flan/yaml_templates/held_in_template_yaml + task: anli_r3 + dataset_path: anli + use_prompt: flan/prompt_templates/flan_anli.yaml:* + validation_split: dev_r3 diff --git a/lm_eval/benchmarks/flan_boolq.yaml b/lm_eval/benchmarks/flan_boolq.yaml new file mode 100644 index 00000000..f7ca4796 --- /dev/null +++ b/lm_eval/benchmarks/flan_boolq.yaml @@ -0,0 +1,7 @@ +group: flan_boolq +task: + - include: flan/yaml_templates/held_in_template_yaml + dataset_path: super_glue + dataset_name: boolq + use_prompt: flan/prompt_templates/flan_boolq.yaml:* + validation_split: validation diff --git a/lm_eval/benchmarks/flan_held_in.yaml b/lm_eval/benchmarks/flan_held_in.yaml index f6d62f03..f1965c9d 100644 --- a/lm_eval/benchmarks/flan_held_in.yaml +++ b/lm_eval/benchmarks/flan_held_in.yaml @@ -26,12 +26,14 @@ task: use_prompt: flan/prompt_templates/flan_anli.yaml:* validation_split: dev_r3 - include: flan/yaml_templates/held_in_template_yaml - task: ai2_arc - dataset_path: ARC-Easy - use_prompt: local:* + task: arc_easy + dataset_path: ai2_arc + dataset_name: ARC-Easy + use_prompt: flan/prompt_templates/flan_arc.yaml:* validation_split: validation - include: flan/yaml_templates/held_in_template_yaml - task: ai2_arc - dataset_path: ARC-Challange - use_prompt: local:* + task: arc_challenge + dataset_path: ai2_arc + dataset_name: ARC-Challenge + use_prompt: flan/prompt_templates/flan_arc.yaml:* validation_split: validation diff --git a/lm_eval/benchmarks/flan_rte.yaml b/lm_eval/benchmarks/flan_rte.yaml new file mode 100644 index 00000000..cf5832bf --- /dev/null +++ b/lm_eval/benchmarks/flan_rte.yaml @@ -0,0 +1,7 @@ +group: flan_rte +task: + - include: flan/yaml_templates/held_in_template_yaml + dataset_path: super_glue + dataset_name: rte + use_prompt: flan/prompt_templates/flan_rte.yaml:* + validation_split: validation -- GitLab From 8c39bfc7038511001f26fa14261716ce988d8591 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 4 Sep 2023 16:42:48 +0000 Subject: [PATCH 037/212] add codexglue --- lm_eval/tasks/code_x_glue/code-text/bleu.py | 213 ++++++++++++++++++ lm_eval/tasks/code_x_glue/code-text/go.yaml | 19 ++ lm_eval/tasks/code_x_glue/code-text/java.yaml | 19 ++ .../code_x_glue/code-text/javascript.yaml | 19 ++ lm_eval/tasks/code_x_glue/code-text/php.yaml | 19 ++ .../tasks/code_x_glue/code-text/python.yaml | 19 ++ lm_eval/tasks/code_x_glue/code-text/ruby.yaml | 19 ++ lm_eval/tasks/code_x_glue/code-text/utils.py | 14 ++ 8 files changed, 341 insertions(+) create mode 100644 lm_eval/tasks/code_x_glue/code-text/bleu.py create mode 100644 lm_eval/tasks/code_x_glue/code-text/go.yaml create mode 100644 lm_eval/tasks/code_x_glue/code-text/java.yaml create mode 100644 
lm_eval/tasks/code_x_glue/code-text/javascript.yaml create mode 100644 lm_eval/tasks/code_x_glue/code-text/php.yaml create mode 100644 lm_eval/tasks/code_x_glue/code-text/python.yaml create mode 100644 lm_eval/tasks/code_x_glue/code-text/ruby.yaml create mode 100644 lm_eval/tasks/code_x_glue/code-text/utils.py
diff --git a/lm_eval/tasks/code_x_glue/code-text/bleu.py b/lm_eval/tasks/code_x_glue/code-text/bleu.py new file mode 100644 index 00000000..50243474 --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py @@ -0,0 +1,213 @@ +#!/usr/bin/python + +''' +This script was adapted from the original version by hieuhoang1972 which is part of MOSES. +''' + +# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $ + +'''Provides: + +cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). +cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). +score_cooked(alltest, n=4): Score a list of cooked test sentences. + +score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids. + +The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible. +''' + +import sys, math, re, xml.sax.saxutils +import subprocess +import os + +# Added to bypass NIST-style pre-processing of hyp and ref files -- wade +nonorm = 0 + +preserve_case = False +eff_ref_len = "shortest" + +normalize1 = [ + ('<skipped>', ''), # strip "skipped" tags + (r'-\n', ''), # strip end-of-line hyphenation and join lines + (r'\n', ' '), # join lines +# (r'(\d)\s+(?=\d)', r'\1'), # join digits +] +normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1] + +normalize2 = [ + (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing + (r'([^0-9])([\.,])',r'\1 \2 '), # tokenize period and comma unless preceded by a digit + (r'([\.,])([^0-9])',r' \1 \2'), # tokenize period and comma unless followed by a digit + (r'([0-9])(-)',r'\1 \2 ') # tokenize dash when preceded by a digit +] +normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2] +
+def normalize(s): + '''Normalize and tokenize text.
This is lifted from NIST mteval-v11a.pl.''' + # Added to bypass NIST-style pre-processing of hyp and ref files -- wade + if (nonorm): + return s.split() + if type(s) is not str: + s = " ".join(s) + # language-independent part: + for (pattern, replace) in normalize1: + s = re.sub(pattern, replace, s) + s = xml.sax.saxutils.unescape(s, {'&quot;':'"'}) + # language-dependent part (assuming Western languages): + s = " %s " % s + if not preserve_case: + s = s.lower() # this might not be identical to the original + for (pattern, replace) in normalize2: + s = re.sub(pattern, replace, s) + return s.split() + +def count_ngrams(words, n=4): + counts = {} + for k in range(1,n+1): + for i in range(len(words)-k+1): + ngram = tuple(words[i:i+k]) + counts[ngram] = counts.get(ngram, 0)+1 + return counts + +def cook_refs(refs, n=4): + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them.''' + + refs = [normalize(ref) for ref in refs] + maxcounts = {} + for ref in refs: + counts = count_ngrams(ref, n) + for (ngram,count) in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram,0), count) + return ([len(ref) for ref in refs], maxcounts) + +def cook_test(test, item, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.''' + (reflens, refmaxcounts)=item + test = normalize(test) + result = {} + result["testlen"] = len(test) + + # Calculate effective reference sentence length. + + if eff_ref_len == "shortest": + result["reflen"] = min(reflens) + elif eff_ref_len == "average": + result["reflen"] = float(sum(reflens))/len(reflens) + elif eff_ref_len == "closest": + min_diff = None + for reflen in reflens: + if min_diff is None or abs(reflen-len(test)) < min_diff: + min_diff = abs(reflen-len(test)) + result['reflen'] = reflen + + result["guess"] = [max(len(test)-k+1,0) for k in range(1,n+1)] + + result['correct'] = [0]*n + counts = count_ngrams(test, n) + for (ngram, count) in counts.items(): + result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) + + return result + +def score_cooked(allcomps, n=4, ground=0, smooth=1): + totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} + for comps in allcomps: + for key in ['testlen','reflen']: + totalcomps[key] += comps[key] + for key in ['guess','correct']: + for k in range(n): + totalcomps[key][k] += comps[key][k] + logbleu = 0.0 + all_bleus = [] + for k in range(n): + correct = totalcomps['correct'][k] + guess = totalcomps['guess'][k] + addsmooth = 0 + if smooth == 1 and k > 0: + addsmooth = 1 + logbleu += math.log(correct + addsmooth + sys.float_info.min)-math.log(guess + addsmooth+ sys.float_info.min) + if guess == 0: + all_bleus.append(-10000000) + else: + all_bleus.append(math.log(correct + sys.float_info.min)-math.log( guess )) + + logbleu /= float(n) + all_bleus.insert(0, logbleu) + + brevPenalty = min(0,1-float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1)) + for i in range(len(all_bleus)): + if i ==0: + all_bleus[i] += brevPenalty + all_bleus[i] = math.exp(all_bleus[i]) + return all_bleus + +def bleu(refs, candidate, ground=0, smooth=1): + refs = cook_refs(refs) + test = cook_test(candidate, refs) + return score_cooked([test], ground=ground, smooth=smooth) + +def splitPuncts(line): + return ' '.join(re.findall(r"[\w]+|[^\s\w]", line)) + +def computeMaps(predictions, goldfile): + predictionMap = {} + goldMap = {} + gf = open(goldfile, 'r') + + for row in
predictions: + cols = row.strip().split('\t') + if len(cols) == 1: + (rid, pred) = (cols[0], '') + else: + (rid, pred) = (cols[0], cols[1]) + predictionMap[rid] = [splitPuncts(pred.strip().lower())] + + for row in gf: + (rid, pred) = row.split('\t') + if rid in predictionMap: # Only insert if the id exists for the method + if rid not in goldMap: + goldMap[rid] = [] + goldMap[rid].append(splitPuncts(pred.strip().lower())) + + sys.stderr.write('Total: ' + str(len(goldMap)) + '\n') + return (goldMap, predictionMap) + + +#m1 is the reference map +#m2 is the prediction map +def bleuFromMaps(m1, m2): + score = [0] * 5 + num = 0.0 + + for key in m1: + if key in m2: + bl = bleu(m1[key], m2[key][0]) + score = [ score[i] + bl[i] for i in range(0, len(bl))] + num += 1 + return [s * 100.0 / num for s in score] + + +def smoothed_bleu_4(references, predictions, **kwargs): + + predictionMap = {} + goldMap = {} + + for rid, pred in enumerate(predictions): + predictionMap[rid] = [splitPuncts(pred.strip().lower())] + + for rid, row in enumerate(references): + goldMap[rid] = [splitPuncts(row.strip().lower())] + + return bleuFromMaps(goldMap, predictionMap)[0] + +if __name__ == '__main__': + reference_file = sys.argv[1] + predictions = [] + for row in sys.stdin: + predictions.append(row) + (goldMap, predictionMap) = computeMaps(predictions, reference_file) + print (bleuFromMaps(goldMap, predictionMap)[0])
diff --git a/lm_eval/tasks/code_x_glue/code-text/go.yaml b/lm_eval/tasks/code_x_glue/code-text/go.yaml new file mode 100644 index 00000000..3a4033c6 --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml @@ -0,0 +1,19 @@ +group: + - codexglue_code2text +task: code2text_go +dataset_path: CM/codexglue_code2text_go +training_split: train +validation_split: validation +test_split: test +output_type: greedy_until +generation_kwargs: + num_beams: 10 + max_length: 128 + until: + - "</s>" +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +metric_list: + - metric: !function bleu.smoothed_bleu_4 + aggregation: mean + higher_is_better: True
diff --git a/lm_eval/tasks/code_x_glue/code-text/java.yaml b/lm_eval/tasks/code_x_glue/code-text/java.yaml new file mode 100644 index 00000000..141673c9 --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml @@ -0,0 +1,19 @@ +group: + - codexglue_code2text +task: code2text_java +dataset_path: CM/codexglue_code2text_java +training_split: train +validation_split: validation +test_split: test +output_type: greedy_until +generation_kwargs: + num_beams: 10 + max_length: 128 + until: + - "</s>" +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +metric_list: + - metric: !function bleu.smoothed_bleu_4 + aggregation: mean + higher_is_better: True
diff --git a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml new file mode 100644 index 00000000..c537e50d --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml @@ -0,0 +1,19 @@ +group: + - codexglue_code2text +task: code2text_javascript +dataset_path: CM/codexglue_code2text_javascript +training_split: train +validation_split: validation +test_split: test +output_type: greedy_until +generation_kwargs: + num_beams: 10 + max_length: 128 + until: + - "</s>" +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +metric_list: + - metric: !function bleu.smoothed_bleu_4 + aggregation: mean + higher_is_better: True
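bleu.py above keeps the BLEU computation in three phases (cook_refs, cook_test, score_cooked) so several test sets can be scored against the same cooked references, and wraps them in smoothed_bleu_4, the metric the yamls reference via !function bleu.smoothed_bleu_4. A minimal usage sketch, assuming the file is importable as `bleu` (e.g. run from lm_eval/tasks/code_x_glue/code-text/):

from bleu import smoothed_bleu_4

# One toy reference/prediction pair; real usage passes the per-document
# docstring targets and the model generations.
references = ["returns the sum of two numbers ."]
predictions = ["return the sum of two numbers ."]

# smoothed_bleu_4 lower-cases and punctuation-tokenizes both sides via
# splitPuncts, scores 4-gram BLEU with +1 smoothing on the higher-order
# n-gram precisions plus a brevity penalty, then averages over the corpus
# on a 0-100 scale.
print(smoothed_bleu_4(references, predictions))

diff --git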
a/lm_eval/tasks/code_x_glue/code-text/php.yaml b/lm_eval/tasks/code_x_glue/code-text/php.yaml new file mode 100644 index 00000000..9137bdaf --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml @@ -0,0 +1,19 @@ +group: + - codexglue_code2text +task: code2text_php +dataset_path: CM/codexglue_code2text_php +training_split: train +validation_split: validation +test_split: test +output_type: greedy_until +generation_kwargs: + num_beams: 10 + max_length: 128 + until: + - "</s>" +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +metric_list: + - metric: !function bleu.smoothed_bleu_4 + aggregation: mean + higher_is_better: True
diff --git a/lm_eval/tasks/code_x_glue/code-text/python.yaml b/lm_eval/tasks/code_x_glue/code-text/python.yaml new file mode 100644 index 00000000..a98bfdba --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml @@ -0,0 +1,19 @@ +group: + - codexglue_code2text +task: code2text_python +dataset_path: CM/codexglue_code2text_python +training_split: train +validation_split: validation +test_split: test +output_type: greedy_until +generation_kwargs: + num_beams: 10 + max_length: 128 + until: + - "</s>" +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +metric_list: + - metric: !function bleu.smoothed_bleu_4 + aggregation: mean + higher_is_better: True
diff --git a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml new file mode 100644 index 00000000..d6562d4c --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml @@ -0,0 +1,19 @@ +group: + - codexglue_code2text +task: code2text_ruby +dataset_path: CM/codexglue_code2text_ruby +training_split: train +validation_split: validation +test_split: test +output_type: greedy_until +generation_kwargs: + num_beams: 10 + max_length: 128 + until: + - "</s>" +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +metric_list: + - metric: !function bleu.smoothed_bleu_4 + aggregation: mean + higher_is_better: True
diff --git a/lm_eval/tasks/code_x_glue/code-text/utils.py b/lm_eval/tasks/code_x_glue/code-text/utils.py new file mode 100644 index 00000000..89cbbdf3 --- /dev/null +++ b/lm_eval/tasks/code_x_glue/code-text/utils.py @@ -0,0 +1,14 @@ + +def doc_to_text(doc): + + inputs = ' '.join(doc['code_tokens']).replace('\n',' ') + inputs = ' '.join(inputs.strip().split()) + + return inputs + +def doc_to_target(doc): + + targets = ' '.join(doc['docstring_tokens']).replace('\n','') + targets = ' '.join(targets.strip().split()) + + return targets \ No newline at end of file
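The two helpers in utils.py above flatten the CodeXGLUE token lists into single whitespace-normalized lines. A standalone check, using a hypothetical record shaped like the dataset (values are illustrative only):

doc = {
    "code_tokens": ["def", "add", "(", "a", ",", "b", ")", ":", "\n",
                    "return", "a", "+", "b"],
    "docstring_tokens": ["Add", "two", "numbers", "."],
}

# doc_to_text: join code tokens, turn newlines into spaces, collapse runs
# of whitespace into single spaces.
inputs = " ".join(doc["code_tokens"]).replace("\n", " ")
inputs = " ".join(inputs.strip().split())
print(inputs)   # -> def add ( a , b ) : return a + b

# doc_to_target: same idea for the docstring tokens, with newlines dropped.
targets = " ".join(doc["docstring_tokens"]).replace("\n", "")
targets = " ".join(targets.strip().split())
print(targets)  # -> Add two numbers .

-- GitLab From 7601d82827abfeb60288e1c561e8ac518a5b4d0d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 5 Sep 2023 14:11:24 +0000 Subject: [PATCH 038/212] edit to fix cot filter --- .../tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml | 2 +- .../bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml index 34d7f066..680c2533 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml @@ -18,5 +18,5 @@ filter_list: - name: "get-answer" filter: - function: "regex" - regex_pattern: "(?<=The answer is )(.*)(?=.)" + regex_pattern: "(?<=the answer is )(.*)(?=.)" - function: "take_first" \ No newline at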
end of file diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml index bda6eb96..66ab12e3 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml @@ -18,5 +18,5 @@ filter_list: - name: "get-answer" filter: - function: "regex" - regex_pattern: "(?<=The answer is )(.*)(?=.)" + regex_pattern: "(?<=the answer is )(.*)(?=.)" - function: "take_first" \ No newline at end of file -- GitLab From 605787a9658e1acb5ff3f01900723c5bd360ca7b Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 13 Sep 2023 14:31:10 +0000 Subject: [PATCH 039/212] add bigbench --- lm_eval/tasks/bigbench/README.md | 49 +++++ lm_eval/tasks/bigbench/generate_tasks.py | 198 ++++++++++++++++++ .../abstract_narrative_understanding.yaml | 4 + .../bigbench/greedy_until/anachronisms.yaml | 4 + .../greedy_until/analogical_similarity.yaml | 4 + .../greedy_until/analytic_entailment.yaml | 4 + .../bigbench/greedy_until/arithmetic.yaml | 4 + .../greedy_until/ascii_word_recognition.yaml | 4 + .../greedy_until/authorship_verification.yaml | 4 + .../greedy_until/auto_categorization.yaml | 4 + .../bigbench/greedy_until/auto_debugging.yaml | 4 + .../bigbench/greedy_until/bbq_lite_json.yaml | 4 + .../bridging_anaphora_resolution_barqa.yaml | 4 + .../greedy_until/causal_judgment.yaml | 4 + .../greedy_until/cause_and_effect.yaml | 4 + .../greedy_until/checkmate_in_one.yaml | 4 + .../greedy_until/chess_state_tracking.yaml | 4 + .../chinese_remainder_theorem.yaml | 4 + .../greedy_until/cifar10_classification.yaml | 4 + .../greedy_until/code_line_description.yaml | 4 + .../bigbench/greedy_until/codenames.yaml | 4 + .../tasks/bigbench/greedy_until/color.yaml | 4 + .../greedy_until/common_morpheme.yaml | 4 + .../greedy_until/conceptual_combinations.yaml | 4 + .../greedy_until/conlang_translation.yaml | 4 + ...extual_parametric_knowledge_conflicts.yaml | 4 + .../bigbench/greedy_until/crash_blossom.yaml | 4 + .../tasks/bigbench/greedy_until/crass_ai.yaml | 4 + .../greedy_until/cryobiology_spanish.yaml | 4 + .../bigbench/greedy_until/cryptonite.yaml | 4 + .../bigbench/greedy_until/cs_algorithms.yaml | 4 + .../greedy_until/dark_humor_detection.yaml | 4 + .../greedy_until/date_understanding.yaml | 4 + .../greedy_until/disambiguation_qa.yaml | 4 + .../discourse_marker_prediction.yaml | 4 + .../tasks/bigbench/greedy_until/disfl_qa.yaml | 4 + .../bigbench/greedy_until/dyck_languages.yaml | 4 + .../greedy_until/elementary_math_qa.yaml | 4 + .../bigbench/greedy_until/emoji_movie.yaml | 4 + .../emojis_emotion_prediction.yaml | 4 + .../greedy_until/empirical_judgments.yaml | 4 + .../greedy_until/english_proverbs.yaml | 4 + .../english_russian_proverbs.yaml | 4 + .../greedy_until/entailed_polarity.yaml | 4 + .../greedy_until/entailed_polarity_hindi.yaml | 4 + .../greedy_until/epistemic_reasoning.yaml | 4 + .../evaluating_information_essentiality.yaml | 4 + .../bigbench/greedy_until/fact_checker.yaml | 4 + .../greedy_until/fantasy_reasoning.yaml | 4 + .../bigbench/greedy_until/few_shot_nlg.yaml | 4 + .../figure_of_speech_detection.yaml | 4 + .../formal_fallacies_syllogisms_negation.yaml | 4 + lm_eval/tasks/bigbench/greedy_until/gem.yaml | 4 + .../gender_inclusive_sentences_german.yaml | 4 + .../greedy_until/general_knowledge.yaml | 4 + .../greedy_until/geometric_shapes.yaml | 4 + .../greedy_until/goal_step_wikihow.yaml | 4 + .../gre_reading_comprehension.yaml | 
4 + .../bigbench/greedy_until/hhh_alignment.yaml | 4 + .../hindi_question_answering.yaml | 4 + .../greedy_until/hindu_knowledge.yaml | 4 + .../greedy_until/hinglish_toxicity.yaml | 4 + .../greedy_until/human_organs_senses.yaml | 4 + .../bigbench/greedy_until/hyperbaton.yaml | 4 + .../greedy_until/identify_math_theorems.yaml | 4 + .../greedy_until/identify_odd_metaphor.yaml | 4 + .../bigbench/greedy_until/implicatures.yaml | 4 + .../greedy_until/implicit_relations.yaml | 4 + .../greedy_until/intent_recognition.yaml | 4 + .../international_phonetic_alphabet_nli.yaml | 4 + ...ional_phonetic_alphabet_transliterate.yaml | 4 + .../greedy_until/intersect_geometry.yaml | 4 + .../greedy_until/irony_identification.yaml | 4 + .../bigbench/greedy_until/kanji_ascii.yaml | 4 + .../tasks/bigbench/greedy_until/kannada.yaml | 4 + .../bigbench/greedy_until/key_value_maps.yaml | 4 + .../bigbench/greedy_until/known_unknowns.yaml | 4 + .../bigbench/greedy_until/language_games.yaml | 4 + .../greedy_until/language_identification.yaml | 4 + .../greedy_until/linguistic_mappings.yaml | 4 + .../greedy_until/linguistics_puzzles.yaml | 4 + .../bigbench/greedy_until/list_functions.yaml | 4 + .../greedy_until/logic_grid_puzzle.yaml | 4 + .../bigbench/greedy_until/logical_args.yaml | 4 + .../greedy_until/logical_deduction.yaml | 4 + .../logical_fallacy_detection.yaml | 4 + .../greedy_until/logical_sequence.yaml | 4 + .../greedy_until/mathematical_induction.yaml | 4 + .../bigbench/greedy_until/matrixshapes.yaml | 4 + .../greedy_until/metaphor_boolean.yaml | 4 + .../greedy_until/metaphor_understanding.yaml | 4 + .../greedy_until/minute_mysteries_qa.yaml | 4 + .../bigbench/greedy_until/misconceptions.yaml | 4 + .../greedy_until/misconceptions_russian.yaml | 4 + .../bigbench/greedy_until/mnist_ascii.yaml | 4 + .../greedy_until/modified_arithmetic.yaml | 4 + .../greedy_until/moral_permissibility.yaml | 4 + .../movie_dialog_same_or_different.yaml | 4 + .../greedy_until/movie_recommendation.yaml | 4 + .../greedy_until/mult_data_wrangling.yaml | 4 + .../tasks/bigbench/greedy_until/multiemo.yaml | 4 + .../greedy_until/natural_instructions.yaml | 4 + .../tasks/bigbench/greedy_until/navigate.yaml | 4 + .../greedy_until/nonsense_words_grammar.yaml | 4 + .../bigbench/greedy_until/novel_concepts.yaml | 4 + .../greedy_until/object_counting.yaml | 4 + .../bigbench/greedy_until/odd_one_out.yaml | 4 + .../bigbench/greedy_until/operators.yaml | 4 + .../greedy_until/paragraph_segmentation.yaml | 4 + .../bigbench/greedy_until/parsinlu_qa.yaml | 4 + .../parsinlu_reading_comprehension.yaml | 4 + .../greedy_until/penguins_in_a_table.yaml | 4 + .../greedy_until/periodic_elements.yaml | 4 + .../bigbench/greedy_until/persian_idioms.yaml | 4 + .../greedy_until/phrase_relatedness.yaml | 4 + .../greedy_until/physical_intuition.yaml | 4 + .../tasks/bigbench/greedy_until/physics.yaml | 4 + .../greedy_until/physics_questions.yaml | 4 + .../play_dialog_same_or_different.yaml | 4 + .../polish_sequence_labeling.yaml | 4 + .../greedy_until/presuppositions_as_nli.yaml | 4 + .../bigbench/greedy_until/qa_wikidata.yaml | 4 + .../greedy_until/question_selection.yaml | 4 + .../greedy_until/real_or_fake_text.yaml | 4 + .../reasoning_about_colored_objects.yaml | 4 + .../greedy_until/repeat_copy_logic.yaml | 4 + .../tasks/bigbench/greedy_until/rephrase.yaml | 4 + .../bigbench/greedy_until/riddle_sense.yaml | 4 + .../bigbench/greedy_until/ruin_names.yaml | 4 + .../salient_translation_error_detection.yaml | 4 + .../scientific_press_release.yaml | 4 + 
.../semantic_parsing_in_context_sparc.yaml | 4 + .../greedy_until/semantic_parsing_spider.yaml | 4 + .../greedy_until/sentence_ambiguity.yaml | 4 + .../similarities_abstraction.yaml | 4 + .../greedy_until/simp_turing_concept.yaml | 4 + .../greedy_until/simple_arithmetic_json.yaml | 4 + ...imple_arithmetic_json_multiple_choice.yaml | 4 + .../simple_arithmetic_json_subtasks.yaml | 4 + ...mple_arithmetic_multiple_targets_json.yaml | 4 + .../simple_ethical_questions.yaml | 4 + .../greedy_until/simple_text_editing.yaml | 4 + .../tasks/bigbench/greedy_until/snarks.yaml | 4 + .../bigbench/greedy_until/social_iqa.yaml | 4 + .../bigbench/greedy_until/social_support.yaml | 4 + .../greedy_until/sports_understanding.yaml | 4 + .../greedy_until/strange_stories.yaml | 4 + .../bigbench/greedy_until/strategyqa.yaml | 4 + .../greedy_until/sufficient_information.yaml | 4 + .../bigbench/greedy_until/suicide_risk.yaml | 4 + .../swahili_english_proverbs.yaml | 4 + .../swedish_to_german_proverbs.yaml | 4 + .../greedy_until/symbol_interpretation.yaml | 4 + .../greedy_until/temporal_sequences.yaml | 4 + .../tasks/bigbench/greedy_until/tense.yaml | 4 + .../tasks/bigbench/greedy_until/timedial.yaml | 4 + .../bigbench/greedy_until/topical_chat.yaml | 4 + .../tracking_shuffled_objects.yaml | 4 + .../greedy_until/understanding_fables.yaml | 4 + .../greedy_until/undo_permutation.yaml | 4 + .../greedy_until/unit_conversion.yaml | 4 + .../greedy_until/unit_interpretation.yaml | 4 + .../unnatural_in_context_learning.yaml | 4 + .../vitaminc_fact_verification.yaml | 4 + .../greedy_until/what_is_the_tao.yaml | 4 + .../greedy_until/which_wiki_edit.yaml | 4 + .../tasks/bigbench/greedy_until/winowhy.yaml | 4 + .../bigbench/greedy_until/word_sorting.yaml | 4 + .../greedy_until/word_unscrambling.yaml | 4 + .../tasks/bigbench/greedy_until_template_yaml | 14 ++ .../abstract_narrative_understanding.yaml | 4 + .../multiple_choice/anachronisms.yaml | 4 + .../analogical_similarity.yaml | 4 + .../multiple_choice/analytic_entailment.yaml | 4 + .../bigbench/multiple_choice/arithmetic.yaml | 4 + .../ascii_word_recognition.yaml | 4 + .../authorship_verification.yaml | 4 + .../multiple_choice/auto_categorization.yaml | 4 + .../multiple_choice/auto_debugging.yaml | 4 + .../multiple_choice/bbq_lite_json.yaml | 4 + .../bridging_anaphora_resolution_barqa.yaml | 4 + .../multiple_choice/causal_judgment.yaml | 4 + .../multiple_choice/cause_and_effect.yaml | 4 + .../multiple_choice/checkmate_in_one.yaml | 4 + .../multiple_choice/chess_state_tracking.yaml | 4 + .../chinese_remainder_theorem.yaml | 4 + .../cifar10_classification.yaml | 4 + .../code_line_description.yaml | 4 + .../bigbench/multiple_choice/codenames.yaml | 4 + .../tasks/bigbench/multiple_choice/color.yaml | 4 + .../multiple_choice/common_morpheme.yaml | 4 + .../conceptual_combinations.yaml | 4 + .../multiple_choice/conlang_translation.yaml | 4 + ...extual_parametric_knowledge_conflicts.yaml | 4 + .../multiple_choice/crash_blossom.yaml | 4 + .../bigbench/multiple_choice/crass_ai.yaml | 4 + .../multiple_choice/cryobiology_spanish.yaml | 4 + .../bigbench/multiple_choice/cryptonite.yaml | 4 + .../multiple_choice/cs_algorithms.yaml | 4 + .../multiple_choice/dark_humor_detection.yaml | 4 + .../multiple_choice/date_understanding.yaml | 4 + .../multiple_choice/disambiguation_qa.yaml | 4 + .../discourse_marker_prediction.yaml | 4 + .../bigbench/multiple_choice/disfl_qa.yaml | 4 + .../multiple_choice/dyck_languages.yaml | 4 + .../multiple_choice/elementary_math_qa.yaml | 4 + 
.../bigbench/multiple_choice/emoji_movie.yaml | 4 + .../emojis_emotion_prediction.yaml | 4 + .../multiple_choice/empirical_judgments.yaml | 4 + .../multiple_choice/english_proverbs.yaml | 4 + .../english_russian_proverbs.yaml | 4 + .../multiple_choice/entailed_polarity.yaml | 4 + .../entailed_polarity_hindi.yaml | 4 + .../multiple_choice/epistemic_reasoning.yaml | 4 + .../evaluating_information_essentiality.yaml | 4 + .../multiple_choice/fact_checker.yaml | 4 + .../multiple_choice/fantasy_reasoning.yaml | 4 + .../multiple_choice/few_shot_nlg.yaml | 4 + .../figure_of_speech_detection.yaml | 4 + .../formal_fallacies_syllogisms_negation.yaml | 4 + .../tasks/bigbench/multiple_choice/gem.yaml | 4 + .../gender_inclusive_sentences_german.yaml | 4 + .../multiple_choice/general_knowledge.yaml | 4 + .../multiple_choice/geometric_shapes.yaml | 4 + .../multiple_choice/goal_step_wikihow.yaml | 4 + .../gre_reading_comprehension.yaml | 4 + .../multiple_choice/hhh_alignment.yaml | 4 + .../hindi_question_answering.yaml | 4 + .../multiple_choice/hindu_knowledge.yaml | 4 + .../multiple_choice/hinglish_toxicity.yaml | 4 + .../multiple_choice/human_organs_senses.yaml | 4 + .../bigbench/multiple_choice/hyperbaton.yaml | 4 + .../identify_math_theorems.yaml | 4 + .../identify_odd_metaphor.yaml | 4 + .../multiple_choice/implicatures.yaml | 4 + .../multiple_choice/implicit_relations.yaml | 4 + .../multiple_choice/intent_recognition.yaml | 4 + .../international_phonetic_alphabet_nli.yaml | 4 + ...ional_phonetic_alphabet_transliterate.yaml | 4 + .../multiple_choice/intersect_geometry.yaml | 4 + .../multiple_choice/irony_identification.yaml | 4 + .../bigbench/multiple_choice/kanji_ascii.yaml | 4 + .../bigbench/multiple_choice/kannada.yaml | 4 + .../multiple_choice/key_value_maps.yaml | 4 + .../multiple_choice/known_unknowns.yaml | 4 + .../multiple_choice/language_games.yaml | 4 + .../language_identification.yaml | 4 + .../multiple_choice/linguistic_mappings.yaml | 4 + .../multiple_choice/linguistics_puzzles.yaml | 4 + .../multiple_choice/list_functions.yaml | 4 + .../multiple_choice/logic_grid_puzzle.yaml | 4 + .../multiple_choice/logical_args.yaml | 4 + .../multiple_choice/logical_deduction.yaml | 4 + .../logical_fallacy_detection.yaml | 4 + .../multiple_choice/logical_sequence.yaml | 4 + .../mathematical_induction.yaml | 4 + .../multiple_choice/matrixshapes.yaml | 4 + .../multiple_choice/metaphor_boolean.yaml | 4 + .../metaphor_understanding.yaml | 4 + .../multiple_choice/minute_mysteries_qa.yaml | 4 + .../multiple_choice/misconceptions.yaml | 4 + .../misconceptions_russian.yaml | 4 + .../bigbench/multiple_choice/mnist_ascii.yaml | 4 + .../multiple_choice/modified_arithmetic.yaml | 4 + .../multiple_choice/moral_permissibility.yaml | 4 + .../movie_dialog_same_or_different.yaml | 4 + .../multiple_choice/movie_recommendation.yaml | 4 + .../multiple_choice/mult_data_wrangling.yaml | 4 + .../bigbench/multiple_choice/multiemo.yaml | 4 + .../multiple_choice/natural_instructions.yaml | 4 + .../bigbench/multiple_choice/navigate.yaml | 4 + .../nonsense_words_grammar.yaml | 4 + .../multiple_choice/novel_concepts.yaml | 4 + .../multiple_choice/object_counting.yaml | 4 + .../bigbench/multiple_choice/odd_one_out.yaml | 4 + .../bigbench/multiple_choice/operators.yaml | 4 + .../paragraph_segmentation.yaml | 4 + .../bigbench/multiple_choice/parsinlu_qa.yaml | 4 + .../parsinlu_reading_comprehension.yaml | 4 + .../multiple_choice/penguins_in_a_table.yaml | 4 + .../multiple_choice/periodic_elements.yaml | 4 + 
.../multiple_choice/persian_idioms.yaml | 4 + .../multiple_choice/phrase_relatedness.yaml | 4 + .../multiple_choice/physical_intuition.yaml | 4 + .../bigbench/multiple_choice/physics.yaml | 4 + .../multiple_choice/physics_questions.yaml | 4 + .../play_dialog_same_or_different.yaml | 4 + .../polish_sequence_labeling.yaml | 4 + .../presuppositions_as_nli.yaml | 4 + .../bigbench/multiple_choice/qa_wikidata.yaml | 4 + .../multiple_choice/question_selection.yaml | 4 + .../multiple_choice/real_or_fake_text.yaml | 4 + .../reasoning_about_colored_objects.yaml | 4 + .../multiple_choice/repeat_copy_logic.yaml | 4 + .../bigbench/multiple_choice/rephrase.yaml | 4 + .../multiple_choice/riddle_sense.yaml | 4 + .../bigbench/multiple_choice/ruin_names.yaml | 4 + .../salient_translation_error_detection.yaml | 4 + .../scientific_press_release.yaml | 4 + .../semantic_parsing_in_context_sparc.yaml | 4 + .../semantic_parsing_spider.yaml | 4 + .../multiple_choice/sentence_ambiguity.yaml | 4 + .../similarities_abstraction.yaml | 4 + .../multiple_choice/simp_turing_concept.yaml | 4 + .../simple_arithmetic_json.yaml | 4 + ...imple_arithmetic_json_multiple_choice.yaml | 4 + .../simple_arithmetic_json_subtasks.yaml | 4 + ...mple_arithmetic_multiple_targets_json.yaml | 4 + .../simple_ethical_questions.yaml | 4 + .../multiple_choice/simple_text_editing.yaml | 4 + .../bigbench/multiple_choice/snarks.yaml | 4 + .../bigbench/multiple_choice/social_iqa.yaml | 4 + .../multiple_choice/social_support.yaml | 4 + .../multiple_choice/sports_understanding.yaml | 4 + .../multiple_choice/strange_stories.yaml | 4 + .../bigbench/multiple_choice/strategyqa.yaml | 4 + .../sufficient_information.yaml | 4 + .../multiple_choice/suicide_risk.yaml | 4 + .../swahili_english_proverbs.yaml | 4 + .../swedish_to_german_proverbs.yaml | 4 + .../symbol_interpretation.yaml | 4 + .../multiple_choice/temporal_sequences.yaml | 4 + .../tasks/bigbench/multiple_choice/tense.yaml | 4 + .../bigbench/multiple_choice/timedial.yaml | 4 + .../multiple_choice/topical_chat.yaml | 4 + .../tracking_shuffled_objects.yaml | 4 + .../multiple_choice/understanding_fables.yaml | 4 + .../multiple_choice/undo_permutation.yaml | 4 + .../multiple_choice/unit_conversion.yaml | 4 + .../multiple_choice/unit_interpretation.yaml | 4 + .../unnatural_in_context_learning.yaml | 4 + .../vitaminc_fact_verification.yaml | 4 + .../multiple_choice/what_is_the_tao.yaml | 4 + .../multiple_choice/which_wiki_edit.yaml | 4 + .../bigbench/multiple_choice/winowhy.yaml | 4 + .../multiple_choice/word_sorting.yaml | 4 + .../multiple_choice/word_unscrambling.yaml | 4 + .../bigbench/multiple_choice_template_yaml | 10 + 338 files changed, 1607 insertions(+) create mode 100644 lm_eval/tasks/bigbench/README.md create mode 100644 lm_eval/tasks/bigbench/generate_tasks.py create mode 100644 lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml create mode 100644 
lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/codenames.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/color.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/gem.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml create mode 100644 
lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/implicatures.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/kannada.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/language_games.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/language_identification.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/list_functions.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_args.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml create mode 100644 
lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/multiemo.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/navigate.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/object_counting.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/operators.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/physics.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/question_selection.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/rephrase.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/snarks.yaml create mode 100644 
lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/social_support.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/tense.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/timedial.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/winowhy.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml create mode 100644 lm_eval/tasks/bigbench/greedy_until_template_yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml create mode 100644 
lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/codenames.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/color.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/gem.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml create mode 100644 
lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/kannada.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/language_games.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/navigate.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml create mode 100644 
lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/operators.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/physics.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/snarks.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/social_support.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml create mode 100644 
lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/tense.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/timedial.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml create mode 100644 lm_eval/tasks/bigbench/multiple_choice_template_yaml diff --git a/lm_eval/tasks/bigbench/README.md b/lm_eval/tasks/bigbench/README.md new file mode 100644 index 00000000..bfb7d457 --- /dev/null +++ b/lm_eval/tasks/bigbench/README.md @@ -0,0 +1,49 @@ +# BigBench + +### Paper + +Title: `Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models` + +Abstract: https://arxiv.org/abs/2206.04615 + +The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities. + +Homepage: https://github.com/google/BIG-bench + + +### Citation + +``` +@misc{srivastava2022imitation, + title={Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models}, + author={Aarohi Srivastava and Abhinav Rastogi and Abhishek Rao and Abu Awal Md Shoeb and Abubakar Abid and Adam Fisch and Adam R. Brown and Adam Santoro and Aditya Gupta and Adrià Garriga-Alonso and Agnieszka Kluska and Aitor Lewkowycz and Akshat Agarwal and Alethea Power and Alex Ray and Alex Warstadt and Alexander W. Kocurek and Ali Safaya and Ali Tazarv and Alice Xiang and Alicia Parrish and Allen Nie and Aman Hussain and Amanda Askell and Amanda Dsouza and Ambrose Slone and Ameet Rahane and Anantharaman S. 
Iyer and Anders Andreassen and Andrea Madotto and Andrea Santilli and Andreas Stuhlmüller and Andrew Dai and Andrew La and Andrew Lampinen and Andy Zou and Angela Jiang and Angelica Chen and Anh Vuong and Animesh Gupta and Anna Gottardi and Antonio Norelli and Anu Venkatesh and Arash Gholamidavoodi and Arfa Tabassum and Arul Menezes and Arun Kirubarajan and Asher Mullokandov and Ashish Sabharwal and Austin Herrick and Avia Efrat and Aykut Erdem and Ayla Karakaş and B. Ryan Roberts and Bao Sheng Loe and Barret Zoph and Bartłomiej Bojanowski and Batuhan Özyurt and Behnam Hedayatnia and Behnam Neyshabur and Benjamin Inden and Benno Stein and Berk Ekmekci and Bill Yuchen Lin and Blake Howald and Cameron Diao and Cameron Dour and Catherine Stinson and Cedrick Argueta and César Ferri Ramírez and Chandan Singh and Charles Rathkopf and Chenlin Meng and Chitta Baral and Chiyu Wu and Chris Callison-Burch and Chris Waites and Christian Voigt and Christopher D. Manning and Christopher Potts and Cindy Ramirez and Clara E. Rivera and Clemencia Siro and Colin Raffel and Courtney Ashcraft and Cristina Garbacea and Damien Sileo and Dan Garrette and Dan Hendrycks and Dan Kilman and Dan Roth and Daniel Freeman and Daniel Khashabi and Daniel Levy and Daniel Moseguí González and Danielle Perszyk and Danny Hernandez and Danqi Chen and Daphne Ippolito and Dar Gilboa and David Dohan and David Drakard and David Jurgens and Debajyoti Datta and Deep Ganguli and Denis Emelin and Denis Kleyko and Deniz Yuret and Derek Chen and Derek Tam and Dieuwke Hupkes and Diganta Misra and Dilyar Buzan and Dimitri Coelho Mollo and Diyi Yang and Dong-Ho Lee and Ekaterina Shutova and Ekin Dogus Cubuk and Elad Segal and Eleanor Hagerman and Elizabeth Barnes and Elizabeth Donoway and Ellie Pavlick and Emanuele Rodola and Emma Lam and Eric Chu and Eric Tang and Erkut Erdem and Ernie Chang and Ethan A. Chi and Ethan Dyer and Ethan Jerzak and Ethan Kim and Eunice Engefu Manyasi and Evgenii Zheltonozhskii and Fanyue Xia and Fatemeh Siar and Fernando Martínez-Plumed and Francesca Happé and Francois Chollet and Frieda Rong and Gaurav Mishra and Genta Indra Winata and Gerard de Melo and Germán Kruszewski and Giambattista Parascandolo and Giorgio Mariani and Gloria Wang and Gonzalo Jaimovitch-López and Gregor Betz and Guy Gur-Ari and Hana Galijasevic and Hannah Kim and Hannah Rashkin and Hannaneh Hajishirzi and Harsh Mehta and Hayden Bogar and Henry Shevlin and Hinrich Schütze and Hiromu Yakura and Hongming Zhang and Hugh Mee Wong and Ian Ng and Isaac Noble and Jaap Jumelet and Jack Geissinger and Jackson Kernion and Jacob Hilton and Jaehoon Lee and Jaime Fernández Fisac and James B. Simon and James Koppel and James Zheng and James Zou and Jan Kocoń and Jana Thompson and Jared Kaplan and Jarema Radom and Jascha Sohl-Dickstein and Jason Phang and Jason Wei and Jason Yosinski and Jekaterina Novikova and Jelle Bosscher and Jennifer Marsh and Jeremy Kim and Jeroen Taal and Jesse Engel and Jesujoba Alabi and Jiacheng Xu and Jiaming Song and Jillian Tang and Joan Waweru and John Burden and John Miller and John U. Balis and Jonathan Berant and Jörg Frohberg and Jos Rozen and Jose Hernandez-Orallo and Joseph Boudeman and Joseph Jones and Joshua B. Tenenbaum and Joshua S. Rule and Joyce Chua and Kamil Kanclerz and Karen Livescu and Karl Krauth and Karthik Gopalakrishnan and Katerina Ignatyeva and Katja Markert and Kaustubh D. 
Dhole and Kevin Gimpel and Kevin Omondi and Kory Mathewson and Kristen Chiafullo and Ksenia Shkaruta and Kumar Shridhar and Kyle McDonell and Kyle Richardson and Laria Reynolds and Leo Gao and Li Zhang and Liam Dugan and Lianhui Qin and Lidia Contreras-Ochando and Louis-Philippe Morency and Luca Moschella and Lucas Lam and Lucy Noble and Ludwig Schmidt and Luheng He and Luis Oliveros Colón and Luke Metz and Lütfi Kerem Şenel and Maarten Bosma and Maarten Sap and Maartje ter Hoeve and Maheen Farooqi and Manaal Faruqui and Mantas Mazeika and Marco Baturan and Marco Marelli and Marco Maru and Maria Jose Ramírez Quintana and Marie Tolkiehn and Mario Giulianelli and Martha Lewis and Martin Potthast and Matthew L. Leavitt and Matthias Hagen and Mátyás Schubert and Medina Orduna Baitemirova and Melody Arnaud and Melvin McElrath and Michael A. Yee and Michael Cohen and Michael Gu and Michael Ivanitskiy and Michael Starritt and Michael Strube and Michał Swędrowski and Michele Bevilacqua and Michihiro Yasunaga and Mihir Kale and Mike Cain and Mimee Xu and Mirac Suzgun and Mo Tiwari and Mohit Bansal and Moin Aminnaseri and Mor Geva and Mozhdeh Gheini and Mukund Varma T and Nanyun Peng and Nathan Chi and Nayeon Lee and Neta Gur-Ari Krakover and Nicholas Cameron and Nicholas Roberts and Nick Doiron and Nikita Nangia and Niklas Deckers and Niklas Muennighoff and Nitish Shirish Keskar and Niveditha S. Iyer and Noah Constant and Noah Fiedel and Nuan Wen and Oliver Zhang and Omar Agha and Omar Elbaghdadi and Omer Levy and Owain Evans and Pablo Antonio Moreno Casares and Parth Doshi and Pascale Fung and Paul Pu Liang and Paul Vicol and Pegah Alipoormolabashi and Peiyuan Liao and Percy Liang and Peter Chang and Peter Eckersley and Phu Mon Htut and Pinyu Hwang and Piotr Miłkowski and Piyush Patil and Pouya Pezeshkpour and Priti Oli and Qiaozhu Mei and Qing Lyu and Qinlang Chen and Rabin Banjade and Rachel Etta Rudolph and Raefer Gabriel and Rahel Habacker and Ramón Risco Delgado and Raphaël Millière and Rhythm Garg and Richard Barnes and Rif A. Saurous and Riku Arakawa and Robbe Raymaekers and Robert Frank and Rohan Sikand and Roman Novak and Roman Sitelew and Ronan LeBras and Rosanne Liu and Rowan Jacobs and Rui Zhang and Ruslan Salakhutdinov and Ryan Chi and Ryan Lee and Ryan Stovall and Ryan Teehan and Rylan Yang and Sahib Singh and Saif M. Mohammad and Sajant Anand and Sam Dillavou and Sam Shleifer and Sam Wiseman and Samuel Gruetter and Samuel R. Bowman and Samuel S. Schoenholz and Sanghyun Han and Sanjeev Kwatra and Sarah A. Rous and Sarik Ghazarian and Sayan Ghosh and Sean Casey and Sebastian Bischoff and Sebastian Gehrmann and Sebastian Schuster and Sepideh Sadeghi and Shadi Hamdan and Sharon Zhou and Shashank Srivastava and Sherry Shi and Shikhar Singh and Shima Asaadi and Shixiang Shane Gu and Shubh Pachchigar and Shubham Toshniwal and Shyam Upadhyay and Shyamolima and Debnath and Siamak Shakeri and Simon Thormeyer and Simone Melzi and Siva Reddy and Sneha Priscilla Makini and Soo-Hwan Lee and Spencer Torene and Sriharsha Hatwar and Stanislas Dehaene and Stefan Divic and Stefano Ermon and Stella Biderman and Stephanie Lin and Stephen Prasad and Steven T. Piantadosi and Stuart M. 
Shieber and Summer Misherghi and Svetlana Kiritchenko and Swaroop Mishra and Tal Linzen and Tal Schuster and Tao Li and Tao Yu and Tariq Ali and Tatsu Hashimoto and Te-Lin Wu and Théo Desbordes and Theodore Rothschild and Thomas Phan and Tianle Wang and Tiberius Nkinyili and Timo Schick and Timofei Kornev and Timothy Telleen-Lawton and Titus Tunduny and Tobias Gerstenberg and Trenton Chang and Trishala Neeraj and Tushar Khot and Tyler Shultz and Uri Shaham and Vedant Misra and Vera Demberg and Victoria Nyamai and Vikas Raunak and Vinay Ramasesh and Vinay Uday Prabhu and Vishakh Padmakumar and Vivek Srikumar and William Fedus and William Saunders and William Zhang and Wout Vossen and Xiang Ren and Xiaoyu Tong and Xinran Zhao and Xinyi Wu and Xudong Shen and Yadollah Yaghoobzadeh and Yair Lakretz and Yangqiu Song and Yasaman Bahri and Yejin Choi and Yichi Yang and Yiding Hao and Yifu Chen and Yonatan Belinkov and Yu Hou and Yufang Hou and Yuntao Bai and Zachary Seid and Zhuoye Zhao and Zijian Wang and Zijie J. Wang and Zirui Wang and Ziyi Wu},
+    year={2022},
+    eprint={2206.04615},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `group_name`: `Short description`
+
+#### Tasks
+
+Task names follow the `bigbench_<subtask>_<output_type>` pattern produced by `generate_tasks.py` (see the sample generated config at the end of this README):
+
+* `bigbench_<subtask>_multiple_choice`: multiple-choice variant of the given BIG-bench subtask, scored by comparing the model's likelihoods of the provided answer options.
+* `bigbench_<subtask>_greedy_until`: generative variant of the given BIG-bench subtask, scored on greedily decoded model output.
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
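+
+### Example
+
+Each per-subtask config under `multiple_choice/` and `greedy_until/` is generated by `generate_tasks.py` and is a thin wrapper over the corresponding shared template. For reference, the generated config for the `anachronisms` generative variant added in this PR looks like:
+
+```yaml
+# Generated by utils.py
+dataset_name: anachronisms
+include: ../greedy_until_template_yaml
+task: bigbench_anachronisms_greedy_until
+```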
diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py new file mode 100644 index 00000000..fa68190e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_tasks.py @@ -0,0 +1,198 @@ +import os +import yaml + +all_subtasks = [ + 'abstract_narrative_understanding', + 'anachronisms', + 'analogical_similarity', + 'analytic_entailment', + 'arithmetic', + 'ascii_word_recognition', + 'authorship_verification', + 'auto_categorization', + 'auto_debugging', + 'bbq_lite_json', + 'bridging_anaphora_resolution_barqa', + 'causal_judgment', + 'cause_and_effect', + 'checkmate_in_one', + 'chess_state_tracking', + 'chinese_remainder_theorem', + 'cifar10_classification', + 'code_line_description', + 'codenames', + 'color', + 'common_morpheme', + 'conceptual_combinations', + 'conlang_translation', + 'contextual_parametric_knowledge_conflicts', + 'crash_blossom', + 'crass_ai', + 'cryobiology_spanish', + 'cryptonite', + 'cs_algorithms', + 'dark_humor_detection', + 'date_understanding', + 'disambiguation_qa', + 'discourse_marker_prediction', + 'disfl_qa', + 'dyck_languages', + 'elementary_math_qa', + 'emoji_movie', + 'emojis_emotion_prediction', + 'empirical_judgments', + 'english_proverbs', + 'english_russian_proverbs', + 'entailed_polarity', + 'entailed_polarity_hindi', + 'epistemic_reasoning', + 'evaluating_information_essentiality', + 'fact_checker', + 'fantasy_reasoning', + 'few_shot_nlg', + 'figure_of_speech_detection', + 'formal_fallacies_syllogisms_negation', + 'gem', + 'gender_inclusive_sentences_german', + 'general_knowledge', + 'geometric_shapes', + 'goal_step_wikihow', + 'gre_reading_comprehension', + 'hhh_alignment', + 'hindi_question_answering', + 'hindu_knowledge', + 'hinglish_toxicity', + 'human_organs_senses', + 'hyperbaton', + 'identify_math_theorems', + 'identify_odd_metaphor', + 'implicatures', + 'implicit_relations', + 'intent_recognition', + 'international_phonetic_alphabet_nli', + 'international_phonetic_alphabet_transliterate', + 'intersect_geometry', + 'irony_identification', + 'kanji_ascii', + 'kannada', + 'key_value_maps', + 'known_unknowns', + 'language_games', + 'language_identification', + 'linguistic_mappings', + 'linguistics_puzzles', + 'list_functions', + 'logic_grid_puzzle', + 'logical_args', + 'logical_deduction', + 'logical_fallacy_detection', + 'logical_sequence', + 'mathematical_induction', + 'matrixshapes', + 'metaphor_boolean', + 'metaphor_understanding', + 'minute_mysteries_qa', + 'misconceptions', + 'misconceptions_russian', + 'mnist_ascii', + 'modified_arithmetic', + 'moral_permissibility', + 'movie_dialog_same_or_different', + 'movie_recommendation', + 'mult_data_wrangling', + 'multiemo', + 'natural_instructions', + 'navigate', + 'nonsense_words_grammar', + 'novel_concepts', + 'object_counting', + 'odd_one_out', + 'operators', + 'paragraph_segmentation', + 'parsinlu_qa', + 'parsinlu_reading_comprehension', + 'penguins_in_a_table', + 'periodic_elements', + 'persian_idioms', + 'phrase_relatedness', + 'physical_intuition', + 'physics', + 'physics_questions', + 'play_dialog_same_or_different', + 'polish_sequence_labeling', + 'presuppositions_as_nli', + 'qa_wikidata', + 'question_selection', + 'real_or_fake_text', + 'reasoning_about_colored_objects', + 'repeat_copy_logic', + 'rephrase', + 'riddle_sense', + 'ruin_names', + 'salient_translation_error_detection', + 'scientific_press_release', + 'semantic_parsing_in_context_sparc', + 'semantic_parsing_spider', + 'sentence_ambiguity', + 'similarities_abstraction', + 'simp_turing_concept', 
+ 'simple_arithmetic_json', + 'simple_arithmetic_json_multiple_choice', + 'simple_arithmetic_json_subtasks', + 'simple_arithmetic_multiple_targets_json', + 'simple_ethical_questions', + 'simple_text_editing', + 'snarks', + 'social_iqa', + 'social_support', + 'sports_understanding', + 'strange_stories', + 'strategyqa', + 'sufficient_information', + 'suicide_risk', + 'swahili_english_proverbs', + 'swedish_to_german_proverbs', + 'symbol_interpretation', + 'temporal_sequences', + 'tense', + 'timedial', + 'topical_chat', + 'tracking_shuffled_objects', + 'understanding_fables', + 'undo_permutation', + 'unit_conversion', + 'unit_interpretation', + 'unnatural_in_context_learning', + 'vitaminc_fact_verification', + 'what_is_the_tao', + 'which_wiki_edit', + 'winowhy', + 'word_sorting', + 'word_unscrambling' + ]
+
+
+def main() -> None:
+    # For each output type, emit one thin YAML per BIG-bench subtask that
+    # points back at the shared template via `include`.
+    for path, task_type in zip(
+        ["multiple_choice", "greedy_until"],
+        ["multiple_choice_template_yaml", "greedy_until_template_yaml"],
+    ):
+        os.makedirs(path, exist_ok=True)
+        for task in all_subtasks:
+            file_name = f"{task}.yaml"
+            try:
+                # Open in "x" mode so configs that already exist are skipped;
+                # with "w" the file would be silently overwritten and the
+                # FileExistsError handler below could never fire.
+                with open(f"{path}/{file_name}", "x") as f:
+                    f.write("# Generated by utils.py\n")
+                    yaml.dump(
+                        {
+                            "include": f"../{task_type}",
+                            "task": f"bigbench_{task}_{task_type.split('_template_yaml')[0]}",
+                            "dataset_name": task,
+                        },
+                        f,
+                        width=float("inf"),
+                        allow_unicode=True,
+                    )
+            except FileExistsError:
+                pass
+
+
+if __name__ == "__main__":
+    # Paths are relative, so run this from within lm_eval/tasks/bigbench/.
+    main()
diff --git a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml new file mode 100644 index 00000000..462d1be9 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: abstract_narrative_understanding +include: ../greedy_until_template_yaml +task: bigbench_abstract_narrative_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml b/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml new file mode 100644 index 00000000..d62133a0 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: anachronisms +include: ../greedy_until_template_yaml +task: bigbench_anachronisms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml b/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml new file mode 100644 index 00000000..2fedcd91 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: analogical_similarity +include: ../greedy_until_template_yaml +task: bigbench_analogical_similarity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml b/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml new file mode 100644 index 00000000..58de1bcf --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: analytic_entailment +include: ../greedy_until_template_yaml +task: bigbench_analytic_entailment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml new file mode 100644 index 00000000..6be6a787 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: arithmetic +include: ../greedy_until_template_yaml +task:
bigbench_arithmetic_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml new file mode 100644 index 00000000..0461605a --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ascii_word_recognition +include: ../greedy_until_template_yaml +task: bigbench_ascii_word_recognition_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml new file mode 100644 index 00000000..dbfa2103 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: authorship_verification +include: ../greedy_until_template_yaml +task: bigbench_authorship_verification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml new file mode 100644 index 00000000..9ab1545e --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: auto_categorization +include: ../greedy_until_template_yaml +task: bigbench_auto_categorization_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml new file mode 100644 index 00000000..e8a491c0 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: auto_debugging +include: ../greedy_until_template_yaml +task: bigbench_auto_debugging_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml new file mode 100644 index 00000000..8b97ba0a --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: bbq_lite_json +include: ../greedy_until_template_yaml +task: bigbench_bbq_lite_json_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml new file mode 100644 index 00000000..618d8dd3 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: bridging_anaphora_resolution_barqa +include: ../greedy_until_template_yaml +task: bigbench_bridging_anaphora_resolution_barqa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml b/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml new file mode 100644 index 00000000..687d59ba --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: causal_judgment +include: ../greedy_until_template_yaml +task: bigbench_causal_judgment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml b/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml new file mode 100644 index 00000000..a1f20264 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cause_and_effect +include: ../greedy_until_template_yaml +task: bigbench_cause_and_effect_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml new file mode 100644 index 00000000..4089a228 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: checkmate_in_one +include: ../greedy_until_template_yaml +task: bigbench_checkmate_in_one_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml new file mode 100644 index 00000000..727e7879 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: chess_state_tracking +include: ../greedy_until_template_yaml +task: bigbench_chess_state_tracking_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml new file mode 100644 index 00000000..6af0bcbf --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: chinese_remainder_theorem +include: ../greedy_until_template_yaml +task: bigbench_chinese_remainder_theorem_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml b/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml new file mode 100644 index 00000000..3e0bf92c --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cifar10_classification +include: ../greedy_until_template_yaml +task: bigbench_cifar10_classification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml b/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml new file mode 100644 index 00000000..624ab362 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: code_line_description +include: ../greedy_until_template_yaml +task: bigbench_code_line_description_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/codenames.yaml b/lm_eval/tasks/bigbench/greedy_until/codenames.yaml new file mode 100644 index 00000000..6ea8f12e --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/codenames.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: codenames +include: ../greedy_until_template_yaml +task: bigbench_codenames_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/color.yaml b/lm_eval/tasks/bigbench/greedy_until/color.yaml new file mode 100644 index 00000000..4ae393fd --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/color.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: color +include: ../greedy_until_template_yaml +task: bigbench_color_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml b/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml new file mode 100644 index 00000000..90d183ad --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: common_morpheme +include: ../greedy_until_template_yaml +task: bigbench_common_morpheme_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml new file mode 100644 index 00000000..007649a4 --- /dev/null +++ 
b/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: conceptual_combinations +include: ../greedy_until_template_yaml +task: bigbench_conceptual_combinations_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml b/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml new file mode 100644 index 00000000..3b5bafac --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: conlang_translation +include: ../greedy_until_template_yaml +task: bigbench_conlang_translation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml new file mode 100644 index 00000000..dc594b9b --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: contextual_parametric_knowledge_conflicts +include: ../greedy_until_template_yaml +task: bigbench_contextual_parametric_knowledge_conflicts_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml b/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml new file mode 100644 index 00000000..aca19b7b --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: crash_blossom +include: ../greedy_until_template_yaml +task: bigbench_crash_blossom_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml b/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml new file mode 100644 index 00000000..043e8f47 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: crass_ai +include: ../greedy_until_template_yaml +task: bigbench_crass_ai_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml new file mode 100644 index 00000000..eb9c5b3b --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cryobiology_spanish +include: ../greedy_until_template_yaml +task: bigbench_cryobiology_spanish_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml b/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml new file mode 100644 index 00000000..15c181b2 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cryptonite +include: ../greedy_until_template_yaml +task: bigbench_cryptonite_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml b/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml new file mode 100644 index 00000000..477c2497 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cs_algorithms +include: ../greedy_until_template_yaml +task: bigbench_cs_algorithms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml new file mode 100644 index 00000000..0521848d --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: dark_humor_detection +include: 
../greedy_until_template_yaml +task: bigbench_dark_humor_detection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml new file mode 100644 index 00000000..5936e98f --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: date_understanding +include: ../greedy_until_template_yaml +task: bigbench_date_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml new file mode 100644 index 00000000..ffe5135a --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: disambiguation_qa +include: ../greedy_until_template_yaml +task: bigbench_disambiguation_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml new file mode 100644 index 00000000..aed41150 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: discourse_marker_prediction +include: ../greedy_until_template_yaml +task: bigbench_discourse_marker_prediction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml new file mode 100644 index 00000000..fd087719 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: disfl_qa +include: ../greedy_until_template_yaml +task: bigbench_disfl_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml b/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml new file mode 100644 index 00000000..af29b7f8 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: dyck_languages +include: ../greedy_until_template_yaml +task: bigbench_dyck_languages_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml new file mode 100644 index 00000000..ea1a61ba --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: elementary_math_qa +include: ../greedy_until_template_yaml +task: bigbench_elementary_math_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml b/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml new file mode 100644 index 00000000..a8368f62 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: emoji_movie +include: ../greedy_until_template_yaml +task: bigbench_emoji_movie_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml b/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml new file mode 100644 index 00000000..f8392d66 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: emojis_emotion_prediction +include: ../greedy_until_template_yaml +task: bigbench_emojis_emotion_prediction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml 
b/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml new file mode 100644 index 00000000..97ea08c8 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: empirical_judgments +include: ../greedy_until_template_yaml +task: bigbench_empirical_judgments_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml new file mode 100644 index 00000000..2eaa4a9b --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: english_proverbs +include: ../greedy_until_template_yaml +task: bigbench_english_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml new file mode 100644 index 00000000..d0386e50 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: english_russian_proverbs +include: ../greedy_until_template_yaml +task: bigbench_english_russian_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml new file mode 100644 index 00000000..efb1f853 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: entailed_polarity +include: ../greedy_until_template_yaml +task: bigbench_entailed_polarity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml new file mode 100644 index 00000000..5922a065 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: entailed_polarity_hindi +include: ../greedy_until_template_yaml +task: bigbench_entailed_polarity_hindi_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml new file mode 100644 index 00000000..d6307592 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: epistemic_reasoning +include: ../greedy_until_template_yaml +task: bigbench_epistemic_reasoning_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml new file mode 100644 index 00000000..13b6dd32 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: evaluating_information_essentiality +include: ../greedy_until_template_yaml +task: bigbench_evaluating_information_essentiality_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml b/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml new file mode 100644 index 00000000..6d3ccf9f --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fact_checker +include: ../greedy_until_template_yaml +task: bigbench_fact_checker_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml 
b/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml new file mode 100644 index 00000000..16415a7b --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fantasy_reasoning +include: ../greedy_until_template_yaml +task: bigbench_fantasy_reasoning_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml new file mode 100644 index 00000000..229e1c70 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: few_shot_nlg +include: ../greedy_until_template_yaml +task: bigbench_few_shot_nlg_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml new file mode 100644 index 00000000..059f9f33 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: figure_of_speech_detection +include: ../greedy_until_template_yaml +task: bigbench_figure_of_speech_detection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml new file mode 100644 index 00000000..663a718c --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: formal_fallacies_syllogisms_negation +include: ../greedy_until_template_yaml +task: bigbench_formal_fallacies_syllogisms_negation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/gem.yaml b/lm_eval/tasks/bigbench/greedy_until/gem.yaml new file mode 100644 index 00000000..79492583 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/gem.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: gem +include: ../greedy_until_template_yaml +task: bigbench_gem_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml new file mode 100644 index 00000000..10414179 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: gender_inclusive_sentences_german +include: ../greedy_until_template_yaml +task: bigbench_gender_inclusive_sentences_german_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml new file mode 100644 index 00000000..b2a14656 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: general_knowledge +include: ../greedy_until_template_yaml +task: bigbench_general_knowledge_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml b/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml new file mode 100644 index 00000000..4e256462 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: geometric_shapes +include: ../greedy_until_template_yaml +task: bigbench_geometric_shapes_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml new file mode 100644 
index 00000000..d865e3d4 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: goal_step_wikihow +include: ../greedy_until_template_yaml +task: bigbench_goal_step_wikihow_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml new file mode 100644 index 00000000..9f044835 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: gre_reading_comprehension +include: ../greedy_until_template_yaml +task: bigbench_gre_reading_comprehension_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml b/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml new file mode 100644 index 00000000..1ab62b56 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hhh_alignment +include: ../greedy_until_template_yaml +task: bigbench_hhh_alignment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml new file mode 100644 index 00000000..3a0fa8b2 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hindi_question_answering +include: ../greedy_until_template_yaml +task: bigbench_hindi_question_answering_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml new file mode 100644 index 00000000..19162629 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hindu_knowledge +include: ../greedy_until_template_yaml +task: bigbench_hindu_knowledge_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml new file mode 100644 index 00000000..84073aa0 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hinglish_toxicity +include: ../greedy_until_template_yaml +task: bigbench_hinglish_toxicity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml b/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml new file mode 100644 index 00000000..32fc0058 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: human_organs_senses +include: ../greedy_until_template_yaml +task: bigbench_human_organs_senses_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml b/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml new file mode 100644 index 00000000..d3a65a87 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hyperbaton +include: ../greedy_until_template_yaml +task: bigbench_hyperbaton_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml new file mode 100644 index 00000000..616085c8 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: 
identify_math_theorems +include: ../greedy_until_template_yaml +task: bigbench_identify_math_theorems_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml new file mode 100644 index 00000000..6500f7a9 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: identify_odd_metaphor +include: ../greedy_until_template_yaml +task: bigbench_identify_odd_metaphor_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml b/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml new file mode 100644 index 00000000..fdc133f5 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: implicatures +include: ../greedy_until_template_yaml +task: bigbench_implicatures_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml b/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml new file mode 100644 index 00000000..b05af0ad --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: implicit_relations +include: ../greedy_until_template_yaml +task: bigbench_implicit_relations_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml new file mode 100644 index 00000000..37769770 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: intent_recognition +include: ../greedy_until_template_yaml +task: bigbench_intent_recognition_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml new file mode 100644 index 00000000..81b975c9 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: international_phonetic_alphabet_nli +include: ../greedy_until_template_yaml +task: bigbench_international_phonetic_alphabet_nli_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml new file mode 100644 index 00000000..ac664332 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: international_phonetic_alphabet_transliterate +include: ../greedy_until_template_yaml +task: bigbench_international_phonetic_alphabet_transliterate_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml b/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml new file mode 100644 index 00000000..d08f1d6a --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: intersect_geometry +include: ../greedy_until_template_yaml +task: bigbench_intersect_geometry_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml new file mode 100644 index 00000000..d9d5961c --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml 
@@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: irony_identification +include: ../greedy_until_template_yaml +task: bigbench_irony_identification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml new file mode 100644 index 00000000..b6a7470f --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kanji_ascii +include: ../greedy_until_template_yaml +task: bigbench_kanji_ascii_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/kannada.yaml b/lm_eval/tasks/bigbench/greedy_until/kannada.yaml new file mode 100644 index 00000000..50ad13c1 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/kannada.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kannada +include: ../greedy_until_template_yaml +task: bigbench_kannada_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml b/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml new file mode 100644 index 00000000..6d5ad040 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: key_value_maps +include: ../greedy_until_template_yaml +task: bigbench_key_value_maps_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml b/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml new file mode 100644 index 00000000..c07e0e8c --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: known_unknowns +include: ../greedy_until_template_yaml +task: bigbench_known_unknowns_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/language_games.yaml b/lm_eval/tasks/bigbench/greedy_until/language_games.yaml new file mode 100644 index 00000000..392a7190 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/language_games.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: language_games +include: ../greedy_until_template_yaml +task: bigbench_language_games_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml new file mode 100644 index 00000000..583d9108 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: language_identification +include: ../greedy_until_template_yaml +task: bigbench_language_identification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml new file mode 100644 index 00000000..92a855a8 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: linguistic_mappings +include: ../greedy_until_template_yaml +task: bigbench_linguistic_mappings_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml new file mode 100644 index 00000000..7aec6607 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: linguistics_puzzles +include: ../greedy_until_template_yaml +task: bigbench_linguistics_puzzles_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml 
b/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml new file mode 100644 index 00000000..f7f0d436 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: list_functions +include: ../greedy_until_template_yaml +task: bigbench_list_functions_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml new file mode 100644 index 00000000..2699b12f --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logic_grid_puzzle +include: ../greedy_until_template_yaml +task: bigbench_logic_grid_puzzle_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml new file mode 100644 index 00000000..9a263f96 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_args +include: ../greedy_until_template_yaml +task: bigbench_logical_args_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml new file mode 100644 index 00000000..5e72facb --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_deduction +include: ../greedy_until_template_yaml +task: bigbench_logical_deduction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml new file mode 100644 index 00000000..a21fbc58 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_fallacy_detection +include: ../greedy_until_template_yaml +task: bigbench_logical_fallacy_detection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml new file mode 100644 index 00000000..f01ce277 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_sequence +include: ../greedy_until_template_yaml +task: bigbench_logical_sequence_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml b/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml new file mode 100644 index 00000000..d4b2fcf6 --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: mathematical_induction +include: ../greedy_until_template_yaml +task: bigbench_mathematical_induction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml b/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml new file mode 100644 index 00000000..adf86ecc --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: matrixshapes +include: ../greedy_until_template_yaml +task: bigbench_matrixshapes_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml new file mode 100644 index 00000000..94893b4d --- /dev/null +++ b/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml @@ -0,0 +1,4 @@ +# 
Generated by utils.py
+dataset_name: metaphor_boolean
+include: ../greedy_until_template_yaml
+task: bigbench_metaphor_boolean_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
new file mode 100644
index 00000000..8ca4da75
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: metaphor_understanding
+include: ../greedy_until_template_yaml
+task: bigbench_metaphor_understanding_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
new file mode 100644
index 00000000..b9db2b8f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: minute_mysteries_qa
+include: ../greedy_until_template_yaml
+task: bigbench_minute_mysteries_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
new file mode 100644
index 00000000..60c8221b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: misconceptions
+include: ../greedy_until_template_yaml
+task: bigbench_misconceptions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
new file mode 100644
index 00000000..a1fca685
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: misconceptions_russian
+include: ../greedy_until_template_yaml
+task: bigbench_misconceptions_russian_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
new file mode 100644
index 00000000..b845caa3
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: mnist_ascii
+include: ../greedy_until_template_yaml
+task: bigbench_mnist_ascii_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
new file mode 100644
index 00000000..5dc888f6
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: modified_arithmetic
+include: ../greedy_until_template_yaml
+task: bigbench_modified_arithmetic_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml b/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
new file mode 100644
index 00000000..a20c23be
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: moral_permissibility
+include: ../greedy_until_template_yaml
+task: bigbench_moral_permissibility_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
new file mode 100644
index 00000000..db57a939
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: movie_dialog_same_or_different
+include: ../greedy_until_template_yaml
+task: bigbench_movie_dialog_same_or_different_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
new file mode 100644
index 00000000..00a0c1a4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: movie_recommendation
+include: ../greedy_until_template_yaml
+task: bigbench_movie_recommendation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
new file mode 100644
index 00000000..7a1003cf
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: mult_data_wrangling
+include: ../greedy_until_template_yaml
+task: bigbench_mult_data_wrangling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml b/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
new file mode 100644
index 00000000..df230d77
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: multiemo
+include: ../greedy_until_template_yaml
+task: bigbench_multiemo_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml b/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
new file mode 100644
index 00000000..cc800106
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: natural_instructions
+include: ../greedy_until_template_yaml
+task: bigbench_natural_instructions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/navigate.yaml b/lm_eval/tasks/bigbench/greedy_until/navigate.yaml
new file mode 100644
index 00000000..1e3004d5
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/navigate.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: navigate
+include: ../greedy_until_template_yaml
+task: bigbench_navigate_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
new file mode 100644
index 00000000..169b6743
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: nonsense_words_grammar
+include: ../greedy_until_template_yaml
+task: bigbench_nonsense_words_grammar_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml b/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
new file mode 100644
index 00000000..9618dce2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: novel_concepts
+include: ../greedy_until_template_yaml
+task: bigbench_novel_concepts_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml b/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
new file mode 100644
index 00000000..7b058748
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: object_counting
+include: ../greedy_until_template_yaml
+task: bigbench_object_counting_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml b/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
new file mode 100644
index 00000000..1742789e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: odd_one_out
+include: ../greedy_until_template_yaml
+task: bigbench_odd_one_out_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/operators.yaml b/lm_eval/tasks/bigbench/greedy_until/operators.yaml
new file mode 100644
index 00000000..d71d87c2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/operators.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: operators
+include: ../greedy_until_template_yaml
+task: bigbench_operators_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml
new file mode 100644
index 00000000..13d8fb9d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: paragraph_segmentation
+include: ../greedy_until_template_yaml
+task: bigbench_paragraph_segmentation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml
new file mode 100644
index 00000000..f8b78f8d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: parsinlu_qa
+include: ../greedy_until_template_yaml
+task: bigbench_parsinlu_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml
new file mode 100644
index 00000000..4db292d0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: parsinlu_reading_comprehension
+include: ../greedy_until_template_yaml
+task: bigbench_parsinlu_reading_comprehension_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml
new file mode 100644
index 00000000..a282fa64
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: penguins_in_a_table
+include: ../greedy_until_template_yaml
+task: bigbench_penguins_in_a_table_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml b/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml
new file mode 100644
index 00000000..458a2e3d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: periodic_elements
+include: ../greedy_until_template_yaml
+task: bigbench_periodic_elements_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml b/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml
new file mode 100644
index 00000000..e51eb69a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: persian_idioms
+include: ../greedy_until_template_yaml
+task: bigbench_persian_idioms_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml
new file mode 100644
index 00000000..3b03a67f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: phrase_relatedness
+include: ../greedy_until_template_yaml
+task: bigbench_phrase_relatedness_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml b/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml
new file mode 100644
index 00000000..358b7db3
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: physical_intuition
+include: ../greedy_until_template_yaml
+task: bigbench_physical_intuition_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/physics.yaml b/lm_eval/tasks/bigbench/greedy_until/physics.yaml
new file mode 100644
index 00000000..d9d6f936
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/physics.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: physics
+include: ../greedy_until_template_yaml
+task: bigbench_physics_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml
new file mode 100644
index 00000000..6af11448
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: physics_questions
+include: ../greedy_until_template_yaml
+task: bigbench_physics_questions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml
new file mode 100644
index 00000000..600143e9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: play_dialog_same_or_different
+include: ../greedy_until_template_yaml
+task: bigbench_play_dialog_same_or_different_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml
new file mode 100644
index 00000000..432820ad
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: polish_sequence_labeling
+include: ../greedy_until_template_yaml
+task: bigbench_polish_sequence_labeling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml
new file mode 100644
index 00000000..c492b17f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: presuppositions_as_nli
+include: ../greedy_until_template_yaml
+task: bigbench_presuppositions_as_nli_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml b/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml
new file mode 100644
index 00000000..a23ea6e7
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: qa_wikidata
+include: ../greedy_until_template_yaml
+task: bigbench_qa_wikidata_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml b/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml
new file mode 100644
index 00000000..47953c14
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: question_selection
+include: ../greedy_until_template_yaml
+task: bigbench_question_selection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml
new file mode 100644
index 00000000..e15af76e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: real_or_fake_text
+include: ../greedy_until_template_yaml
+task: bigbench_real_or_fake_text_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml
new file mode 100644
index 00000000..b1aa5ec0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: reasoning_about_colored_objects
+include: ../greedy_until_template_yaml
+task: bigbench_reasoning_about_colored_objects_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml
new file mode 100644
index 00000000..12831cc7
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: repeat_copy_logic
+include: ../greedy_until_template_yaml
+task: bigbench_repeat_copy_logic_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml b/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml
new file mode 100644
index 00000000..78c6bda7
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: rephrase
+include: ../greedy_until_template_yaml
+task: bigbench_rephrase_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml b/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml
new file mode 100644
index 00000000..e93b4aa9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: riddle_sense
+include: ../greedy_until_template_yaml
+task: bigbench_riddle_sense_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml b/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml
new file mode 100644
index 00000000..46039e1f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: ruin_names
+include: ../greedy_until_template_yaml
+task: bigbench_ruin_names_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml
new file mode 100644
index 00000000..a7e5c77e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: salient_translation_error_detection
+include: ../greedy_until_template_yaml
+task: bigbench_salient_translation_error_detection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml b/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml
new file mode 100644
index 00000000..aa35e659
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: scientific_press_release
+include: ../greedy_until_template_yaml
+task: bigbench_scientific_press_release_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml
new file mode 100644
index 00000000..184bfcb9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: semantic_parsing_in_context_sparc
+include: ../greedy_until_template_yaml
+task: bigbench_semantic_parsing_in_context_sparc_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml
new file mode 100644
index 00000000..ae0b9461
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: semantic_parsing_spider
+include: ../greedy_until_template_yaml
+task: bigbench_semantic_parsing_spider_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml b/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml
new file mode 100644
index 00000000..bb72ec88
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: sentence_ambiguity
+include: ../greedy_until_template_yaml
+task: bigbench_sentence_ambiguity_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml b/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml
new file mode 100644
index 00000000..5c1ef27f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: similarities_abstraction
+include: ../greedy_until_template_yaml
+task: bigbench_similarities_abstraction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml
new file mode 100644
index 00000000..742df0fb
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: simp_turing_concept
+include: ../greedy_until_template_yaml
+task: bigbench_simp_turing_concept_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml
new file mode 100644
index 00000000..4e70a160
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: simple_arithmetic_json
+include: ../greedy_until_template_yaml
+task: bigbench_simple_arithmetic_json_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml
new file mode 100644
index 00000000..5f6b6732
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: simple_arithmetic_json_multiple_choice
+include: ../greedy_until_template_yaml
+task: bigbench_simple_arithmetic_json_multiple_choice_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml
new file mode 100644
index 00000000..32c5fcfd
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: simple_arithmetic_json_subtasks
+include: ../greedy_until_template_yaml
+task: bigbench_simple_arithmetic_json_subtasks_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml
new file mode 100644
index 00000000..0d87803e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: simple_arithmetic_multiple_targets_json
+include: ../greedy_until_template_yaml
+task: bigbench_simple_arithmetic_multiple_targets_json_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml
new file mode 100644
index 00000000..2332985c
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: simple_ethical_questions
+include: ../greedy_until_template_yaml
+task: bigbench_simple_ethical_questions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml
new file mode 100644
index 00000000..1d9943e4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: simple_text_editing
+include: ../greedy_until_template_yaml
+task: bigbench_simple_text_editing_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/snarks.yaml b/lm_eval/tasks/bigbench/greedy_until/snarks.yaml
new file mode 100644
index 00000000..e98308e1
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/snarks.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: snarks
+include: ../greedy_until_template_yaml
+task: bigbench_snarks_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml b/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml
new file mode 100644
index 00000000..7af09b30
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: social_iqa
+include: ../greedy_until_template_yaml
+task: bigbench_social_iqa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/social_support.yaml b/lm_eval/tasks/bigbench/greedy_until/social_support.yaml
new file mode 100644
index 00000000..8e34e758
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/social_support.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: social_support
+include: ../greedy_until_template_yaml
+task: bigbench_social_support_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml
new file mode 100644
index 00000000..3ae80c24
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: sports_understanding
+include: ../greedy_until_template_yaml
+task: bigbench_sports_understanding_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml b/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml
new file mode 100644
index 00000000..b6020b08
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: strange_stories
+include: ../greedy_until_template_yaml
+task: bigbench_strange_stories_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml b/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml
new file mode 100644
index 00000000..066c89d1
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: strategyqa
+include: ../greedy_until_template_yaml
+task: bigbench_strategyqa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml b/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml
new file mode 100644
index 00000000..27ef04dd
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: sufficient_information
+include: ../greedy_until_template_yaml
+task: bigbench_sufficient_information_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml b/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml
new file mode 100644
index 00000000..a7496025
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: suicide_risk
+include: ../greedy_until_template_yaml
+task: bigbench_suicide_risk_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml
new file mode 100644
index 00000000..25e7dfe6
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: swahili_english_proverbs
+include: ../greedy_until_template_yaml
+task: bigbench_swahili_english_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml
new file mode 100644
index 00000000..1beebb17
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: swedish_to_german_proverbs
+include: ../greedy_until_template_yaml
+task: bigbench_swedish_to_german_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml
new file mode 100644
index 00000000..27b29a05
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: symbol_interpretation
+include: ../greedy_until_template_yaml
+task: bigbench_symbol_interpretation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml b/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml
new file mode 100644
index 00000000..6ed42414
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: temporal_sequences
+include: ../greedy_until_template_yaml
+task: bigbench_temporal_sequences_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/tense.yaml b/lm_eval/tasks/bigbench/greedy_until/tense.yaml
new file mode 100644
index 00000000..49adc7c2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/tense.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: tense
+include: ../greedy_until_template_yaml
+task: bigbench_tense_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/timedial.yaml b/lm_eval/tasks/bigbench/greedy_until/timedial.yaml
new file mode 100644
index 00000000..391dff43
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/timedial.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: timedial
+include: ../greedy_until_template_yaml
+task: bigbench_timedial_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml b/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml
new file mode 100644
index 00000000..f9f1893f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: topical_chat
+include: ../greedy_until_template_yaml
+task: bigbench_topical_chat_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml
new file mode 100644
index 00000000..675b0e37
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: tracking_shuffled_objects
+include: ../greedy_until_template_yaml
+task: bigbench_tracking_shuffled_objects_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml b/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml
new file mode 100644
index 00000000..3c5ff40a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: understanding_fables
+include: ../greedy_until_template_yaml
+task: bigbench_understanding_fables_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml b/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml
new file mode 100644
index 00000000..8e0c0699
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: undo_permutation
+include: ../greedy_until_template_yaml
+task: bigbench_undo_permutation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml
new file mode 100644
index 00000000..384ccc05
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: unit_conversion
+include: ../greedy_until_template_yaml
+task: bigbench_unit_conversion_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml
new file mode 100644
index 00000000..a33bfd51
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: unit_interpretation
+include: ../greedy_until_template_yaml
+task: bigbench_unit_interpretation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml
new file mode 100644
index 00000000..cb3d2572
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: unnatural_in_context_learning
+include: ../greedy_until_template_yaml
+task: bigbench_unnatural_in_context_learning_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml
new file mode 100644
index 00000000..67380ab9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: vitaminc_fact_verification
+include: ../greedy_until_template_yaml
+task: bigbench_vitaminc_fact_verification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml
new file mode 100644
index 00000000..baad0d9e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: what_is_the_tao
+include: ../greedy_until_template_yaml
+task: bigbench_what_is_the_tao_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml
new file mode 100644
index 00000000..70047ee7
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: which_wiki_edit
+include: ../greedy_until_template_yaml
+task: bigbench_which_wiki_edit_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml b/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml
new file mode 100644
index 00000000..fff312b3
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: winowhy
+include: ../greedy_until_template_yaml
+task: bigbench_winowhy_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml b/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
new file mode 100644
index 00000000..77b55d77
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: word_sorting
+include: ../greedy_until_template_yaml
+task: bigbench_word_sorting_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml b/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
new file mode 100644
index 00000000..75fe7de9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: word_unscrambling
+include: ../greedy_until_template_yaml
+task: bigbench_word_unscrambling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/greedy_until_template_yaml
new file mode 100644
index 00000000..1d4e492b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/greedy_until_template_yaml
@@ -0,0 +1,14 @@
+group: bigbench
+dataset_path: bigbench
+output_type: greedy_until
+training_split: train
+validation_split: validation
+doc_to_text: inputs
+doc_to_target: "{{targets[0]}}"
+generation_kwargs:
+  max_length: 128
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_punctuation: true
diff --git a/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml
new file mode 100644
index 00000000..e815ad82
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: abstract_narrative_understanding
+include: ../multiple_choice_template_yaml
+task: bigbench_abstract_narrative_understanding_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml b/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml
new file mode 100644
index 00000000..0edb33ae
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: anachronisms
+include: ../multiple_choice_template_yaml
+task: bigbench_anachronisms_multiple_choice
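Every generated stub above is the same four lines: a "# Generated by utils.py" header, the BIG-bench subtask name in dataset_name, an include: pointing at the shared template (greedy_until_template_yaml supplies the dataset path, splits, doc_to_text/doc_to_target, generation_kwargs, and metrics), and a task name of the form bigbench_<subtask>_<output_type>. The generator itself is not part of this hunk; what follows is only a minimal sketch of what such a script might look like. The SUBTASKS list, the gen_stub_yamls helper, and the --output_type flag are illustrative assumptions, not the actual utils.py.

# Illustrative sketch only -- NOT the utils.py referenced by these files.
import argparse
import os

import yaml

# Hypothetical subtask list; the real script would enumerate every
# BIG-bench subtask (abstract_narrative_understanding, anachronisms, ...).
SUBTASKS = ["metaphor_boolean", "misconceptions", "navigate"]


def gen_stub_yamls(template_name, save_dir, suffix):
    """Write one four-line stub YAML per subtask, mirroring the diff above."""
    os.makedirs(save_dir, exist_ok=True)
    for subtask in SUBTASKS:
        stub = {
            "dataset_name": subtask,
            "include": f"../{template_name}",
            "task": f"bigbench_{subtask}_{suffix}",
        }
        with open(os.path.join(save_dir, f"{subtask}.yaml"), "w") as f:
            f.write("# Generated by utils.py\n")
            # yaml.dump sorts keys alphabetically by default, which matches
            # the dataset_name / include / task ordering seen in these files.
            yaml.dump(stub, f)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_type", default="greedy_until")
    args = parser.parse_args()
    gen_stub_yamls(
        template_name=f"{args.output_type}_template_yaml",
        save_dir=f"lm_eval/tasks/bigbench/{args.output_type}",
        suffix=args.output_type,
    )

Run once per output type (greedy_until, multiple_choice), this would reproduce both stub families in this patch while keeping all shared configuration in the two templates.
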
diff --git a/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml b/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml
new file mode 100644
index 00000000..4a63e23a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: analogical_similarity
+include: ../multiple_choice_template_yaml
+task: bigbench_analogical_similarity_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml b/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml
new file mode 100644
index 00000000..3503337d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: analytic_entailment
+include: ../multiple_choice_template_yaml
+task: bigbench_analytic_entailment_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml b/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml
new file mode 100644
index 00000000..a7af2d17
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: arithmetic
+include: ../multiple_choice_template_yaml
+task: bigbench_arithmetic_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml
new file mode 100644
index 00000000..9eca1362
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: ascii_word_recognition
+include: ../multiple_choice_template_yaml
+task: bigbench_ascii_word_recognition_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml b/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml
new file mode 100644
index 00000000..0c49e8ee
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: authorship_verification
+include: ../multiple_choice_template_yaml
+task: bigbench_authorship_verification_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml b/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml
new file mode 100644
index 00000000..108cc802
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: auto_categorization
+include: ../multiple_choice_template_yaml
+task: bigbench_auto_categorization_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml b/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml
new file mode 100644
index 00000000..7ae0c2a5
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: auto_debugging
+include: ../multiple_choice_template_yaml
+task: bigbench_auto_debugging_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml
new file mode 100644
index 00000000..6cb2bff4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: bbq_lite_json
+include: ../multiple_choice_template_yaml
+task: bigbench_bbq_lite_json_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml
new file mode 100644
index 00000000..33871759
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: bridging_anaphora_resolution_barqa
+include: ../multiple_choice_template_yaml
+task: bigbench_bridging_anaphora_resolution_barqa_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml b/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml
new file mode 100644
index 00000000..340e9bda
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: causal_judgment
+include: ../multiple_choice_template_yaml
+task: bigbench_causal_judgment_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml b/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml
new file mode 100644
index 00000000..4b3dd1a6
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cause_and_effect
+include: ../multiple_choice_template_yaml
+task: bigbench_cause_and_effect_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml
new file mode 100644
index 00000000..000360c0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: checkmate_in_one
+include: ../multiple_choice_template_yaml
+task: bigbench_checkmate_in_one_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml
new file mode 100644
index 00000000..b6d1f2e2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: chess_state_tracking
+include: ../multiple_choice_template_yaml
+task: bigbench_chess_state_tracking_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml
new file mode 100644
index 00000000..2552166c
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: chinese_remainder_theorem
+include: ../multiple_choice_template_yaml
+task: bigbench_chinese_remainder_theorem_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml b/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml
new file mode 100644
index 00000000..a03e56c0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cifar10_classification
+include: ../multiple_choice_template_yaml
+task: bigbench_cifar10_classification_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml b/lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml
new file mode 100644
index 00000000..f2a33424
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: code_line_description
+include: ../multiple_choice_template_yaml
+task: bigbench_code_line_description_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/codenames.yaml b/lm_eval/tasks/bigbench/multiple_choice/codenames.yaml
new file mode 100644
index 00000000..c03dc365
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/codenames.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: codenames
+include: ../multiple_choice_template_yaml
+task: bigbench_codenames_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/color.yaml b/lm_eval/tasks/bigbench/multiple_choice/color.yaml
new file mode 100644
index 00000000..f49710c7
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/color.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: color
+include: ../multiple_choice_template_yaml
+task: bigbench_color_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml b/lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml
new file mode 100644
index 00000000..619c8eea
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: common_morpheme
+include: ../multiple_choice_template_yaml
+task: bigbench_common_morpheme_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml
new file mode 100644
index 00000000..a7570bb0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: conceptual_combinations
+include: ../multiple_choice_template_yaml
+task: bigbench_conceptual_combinations_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml b/lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml
new file mode 100644
index 00000000..4ff6ef02
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: conlang_translation
+include: ../multiple_choice_template_yaml
+task: bigbench_conlang_translation_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml
new file mode 100644
index 00000000..4b34eec8
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: contextual_parametric_knowledge_conflicts
+include: ../multiple_choice_template_yaml
+task: bigbench_contextual_parametric_knowledge_conflicts_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml b/lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml
new file mode 100644
index 00000000..2b0b9d46
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: crash_blossom
+include: ../multiple_choice_template_yaml
+task: bigbench_crash_blossom_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml b/lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml
new file mode 100644
index 00000000..c203459a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: crass_ai
+include: ../multiple_choice_template_yaml
+task: bigbench_crass_ai_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml
new file mode 100644
index 00000000..c8cdd625
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cryobiology_spanish
+include: ../multiple_choice_template_yaml
+task: bigbench_cryobiology_spanish_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml b/lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml
new file mode 100644
index 00000000..503cd601
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cryptonite
+include: ../multiple_choice_template_yaml
+task: bigbench_cryptonite_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml b/lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml
new file mode 100644
index 00000000..bb9d90b0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cs_algorithms
+include: ../multiple_choice_template_yaml
+task: bigbench_cs_algorithms_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml
new file mode 100644
index 00000000..cb00651a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: dark_humor_detection
+include: ../multiple_choice_template_yaml
+task: bigbench_dark_humor_detection_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml
new file mode 100644
index 00000000..596a941e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: date_understanding
+include: ../multiple_choice_template_yaml
+task: bigbench_date_understanding_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml
new file mode 100644
index 00000000..5264c21f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: disambiguation_qa
+include: ../multiple_choice_template_yaml
+task: bigbench_disambiguation_qa_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml
new file mode 100644
index 00000000..151616c2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: discourse_marker_prediction
+include: ../multiple_choice_template_yaml
+task: bigbench_discourse_marker_prediction_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml
new file mode 100644
index 00000000..578df2a3
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: disfl_qa
+include: ../multiple_choice_template_yaml
+task: bigbench_disfl_qa_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml b/lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml
new file mode 100644
index 00000000..07ecf4d4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: dyck_languages
+include: ../multiple_choice_template_yaml
+task: bigbench_dyck_languages_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml
new file mode 100644
index 00000000..d9e41204
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: elementary_math_qa
+include: ../multiple_choice_template_yaml
+task: bigbench_elementary_math_qa_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml b/lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml
new file mode 100644
index 00000000..f6528de7
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: emoji_movie
+include: ../multiple_choice_template_yaml
+task: bigbench_emoji_movie_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml b/lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml
new file mode 100644
index 00000000..cedbd41c
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: emojis_emotion_prediction
+include: ../multiple_choice_template_yaml
+task: bigbench_emojis_emotion_prediction_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml b/lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml
new file mode 100644
index 00000000..078a3c45
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: empirical_judgments
+include: ../multiple_choice_template_yaml
+task: bigbench_empirical_judgments_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml
new file mode 100644
index 00000000..0dd3a6c6
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: english_proverbs
+include: ../multiple_choice_template_yaml
+task: bigbench_english_proverbs_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml
new file mode 100644
index 00000000..12c7dae6
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: english_russian_proverbs
+include: ../multiple_choice_template_yaml
+task: bigbench_english_russian_proverbs_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml
new file mode 100644
index 00000000..336a013e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: entailed_polarity
+include: ../multiple_choice_template_yaml
+task: bigbench_entailed_polarity_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml
new file mode 100644
index 00000000..4d1bf0e8
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: entailed_polarity_hindi
+include: ../multiple_choice_template_yaml
+task: bigbench_entailed_polarity_hindi_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml
new file mode 100644
index 00000000..79827577
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: epistemic_reasoning
+include: ../multiple_choice_template_yaml
+task: bigbench_epistemic_reasoning_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml
new file mode 100644
index 00000000..f82cd899
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: evaluating_information_essentiality
+include: ../multiple_choice_template_yaml
+task: bigbench_evaluating_information_essentiality_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml b/lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml
new file mode 100644
index 00000000..2e20aabe
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: fact_checker
+include: ../multiple_choice_template_yaml
+task: bigbench_fact_checker_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml b/lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml
new file mode 100644
index 00000000..e7931f2f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: fantasy_reasoning
+include: ../multiple_choice_template_yaml
+task: bigbench_fantasy_reasoning_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml
new file mode 100644
index 00000000..593c4860
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: few_shot_nlg
+include: ../multiple_choice_template_yaml
+task: bigbench_few_shot_nlg_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml
new file mode 100644
index 00000000..00f07670
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: figure_of_speech_detection
+include: ../multiple_choice_template_yaml
+task: bigbench_figure_of_speech_detection_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml
new file mode 100644
index 00000000..b2eb5aca
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: formal_fallacies_syllogisms_negation
+include: ../multiple_choice_template_yaml
+task: bigbench_formal_fallacies_syllogisms_negation_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/gem.yaml b/lm_eval/tasks/bigbench/multiple_choice/gem.yaml
new file mode 100644
index 00000000..5fd4caae
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/gem.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: gem
+include: ../multiple_choice_template_yaml
+task: bigbench_gem_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml
new file mode 100644
index 00000000..77d16864
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: gender_inclusive_sentences_german
+include: ../multiple_choice_template_yaml
+task: bigbench_gender_inclusive_sentences_german_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml b/lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml
new file mode 100644
index 00000000..021ad284
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: general_knowledge
+include: ../multiple_choice_template_yaml
+task: bigbench_general_knowledge_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml b/lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml
new file mode 100644
index 00000000..cfc2ada2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: geometric_shapes
+include: ../multiple_choice_template_yaml
+task: bigbench_geometric_shapes_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml
new file mode 100644
index 00000000..e457887f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: goal_step_wikihow
+include: ../multiple_choice_template_yaml
+task: bigbench_goal_step_wikihow_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml
new file mode 100644
index 00000000..8ec630d5
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: gre_reading_comprehension
+include: ../multiple_choice_template_yaml
+task: bigbench_gre_reading_comprehension_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml b/lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml
new file mode 100644
index 00000000..94272e8a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hhh_alignment
+include: ../multiple_choice_template_yaml
+task: bigbench_hhh_alignment_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml
new file mode 100644
index 00000000..0ab2cecd
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hindi_question_answering
+include: ../multiple_choice_template_yaml
+task: bigbench_hindi_question_answering_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml
new file mode 100644
index 00000000..2d49951b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hindu_knowledge
+include: ../multiple_choice_template_yaml
+task: bigbench_hindu_knowledge_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml
new file mode 100644
index 00000000..4c7ca8bd
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hinglish_toxicity
+include: ../multiple_choice_template_yaml
+task: bigbench_hinglish_toxicity_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml b/lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml
new file mode 100644
index 00000000..d04bccc9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: human_organs_senses
+include: ../multiple_choice_template_yaml
+task: bigbench_human_organs_senses_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml b/lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml
new file mode 100644
index 00000000..9e15ffac
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hyperbaton
+include: ../multiple_choice_template_yaml
+task: bigbench_hyperbaton_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml
new file mode 100644
index 00000000..dfb75722
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: identify_math_theorems
+include: ../multiple_choice_template_yaml
+task: bigbench_identify_math_theorems_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml
new file mode 100644
index 00000000..3657d3b0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: identify_odd_metaphor
+include: ../multiple_choice_template_yaml
+task: bigbench_identify_odd_metaphor_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml b/lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml
new file mode 100644
index 00000000..8c2d4c81
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: implicatures
+include: ../multiple_choice_template_yaml
+task: bigbench_implicatures_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml b/lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml
new file mode 100644
index 00000000..a837cdf3
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: implicit_relations
+include: ../multiple_choice_template_yaml
+task: bigbench_implicit_relations_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml b/lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml
new file mode 100644
index 00000000..9d9cb82b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: intent_recognition
+include: ../multiple_choice_template_yaml
+task: bigbench_intent_recognition_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml
new file mode 100644
index 00000000..715582af
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: international_phonetic_alphabet_nli
+include: ../multiple_choice_template_yaml
+task: bigbench_international_phonetic_alphabet_nli_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml
new file mode 100644
index 00000000..cd6f6f71
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: international_phonetic_alphabet_transliterate
+include: ../multiple_choice_template_yaml
+task: bigbench_international_phonetic_alphabet_transliterate_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml b/lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml
new file mode 100644
index 00000000..d6448572
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: intersect_geometry
+include: ../multiple_choice_template_yaml
+task: bigbench_intersect_geometry_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml b/lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml
new file mode 100644
index 00000000..bb8385f2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: irony_identification
+include: ../multiple_choice_template_yaml
+task: bigbench_irony_identification_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml b/lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml
new file mode 100644
index 00000000..7ba101c8
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: kanji_ascii
+include: ../multiple_choice_template_yaml
+task: bigbench_kanji_ascii_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/kannada.yaml b/lm_eval/tasks/bigbench/multiple_choice/kannada.yaml
new file mode 100644
index 00000000..e3767b21
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/kannada.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: kannada
+include: ../multiple_choice_template_yaml
+task: bigbench_kannada_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml b/lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml
new file mode 100644
index 00000000..88c6bf5e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: key_value_maps
+include: ../multiple_choice_template_yaml
+task: bigbench_key_value_maps_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml b/lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml
new file mode 100644
index 00000000..de972c64
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: known_unknowns
+include: ../multiple_choice_template_yaml
+task: bigbench_known_unknowns_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/language_games.yaml b/lm_eval/tasks/bigbench/multiple_choice/language_games.yaml
new file mode 100644
index 00000000..3e17fd8f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/language_games.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: language_games
+include: ../multiple_choice_template_yaml
+task: bigbench_language_games_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml b/lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml
new file mode 100644
index 00000000..e17cdc69
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: language_identification
+include: ../multiple_choice_template_yaml
+task: bigbench_language_identification_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml
new file mode 100644
index 00000000..118de388
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: linguistic_mappings
+include: ../multiple_choice_template_yaml
+task: bigbench_linguistic_mappings_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml
new file mode 100644
index 00000000..4799e672
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: linguistics_puzzles
+include: ../multiple_choice_template_yaml
+task: bigbench_linguistics_puzzles_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml b/lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml
new file mode 100644
index 00000000..f2c94ada
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: list_functions
+include: ../multiple_choice_template_yaml
+task: bigbench_list_functions_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml
new file mode 100644
index 00000000..c24e71ac
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logic_grid_puzzle
+include: ../multiple_choice_template_yaml
+task: bigbench_logic_grid_puzzle_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml
new file mode 100644
index 00000000..11e2771e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_args
+include: ../multiple_choice_template_yaml
+task: bigbench_logical_args_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml
new file mode 100644
index 00000000..0de47251
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_deduction
+include: ../multiple_choice_template_yaml
+task: bigbench_logical_deduction_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml
new file mode 100644
index 00000000..b4d68c1b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_fallacy_detection
+include: ../multiple_choice_template_yaml
+task: bigbench_logical_fallacy_detection_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml
new file mode 100644
index 00000000..e58224b9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_sequence
+include: ../multiple_choice_template_yaml
+task: bigbench_logical_sequence_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml b/lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml
new file mode 100644
index 00000000..316b8eed
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: mathematical_induction
+include: ../multiple_choice_template_yaml
+task: bigbench_mathematical_induction_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml b/lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml
new file mode 100644
index 00000000..ebbc32f5
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: matrixshapes
+include: ../multiple_choice_template_yaml
+task: bigbench_matrixshapes_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml
new file mode 100644
index 00000000..2bbe0c00
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: metaphor_boolean
+include: ../multiple_choice_template_yaml
+task: bigbench_metaphor_boolean_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml
new file mode 100644
index 00000000..ae0fab49
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: metaphor_understanding
+include: ../multiple_choice_template_yaml
+task: bigbench_metaphor_understanding_multiple_choice
diff --git a/lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml
new file mode 100644
index 00000000..76b1bac0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: minute_mysteries_qa
+include: ../multiple_choice_template_yaml
+task: bigbench_minute_mysteries_qa_multiple_choice
diff --git
a/lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml b/lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml new file mode 100644 index 00000000..dce2a5c2 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: misconceptions +include: ../multiple_choice_template_yaml +task: bigbench_misconceptions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml new file mode 100644 index 00000000..fca2b324 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: misconceptions_russian +include: ../multiple_choice_template_yaml +task: bigbench_misconceptions_russian_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml b/lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml new file mode 100644 index 00000000..ac32701f --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: mnist_ascii +include: ../multiple_choice_template_yaml +task: bigbench_mnist_ascii_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml new file mode 100644 index 00000000..fd5c271a --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: modified_arithmetic +include: ../multiple_choice_template_yaml +task: bigbench_modified_arithmetic_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml b/lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml new file mode 100644 index 00000000..95414745 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: moral_permissibility +include: ../multiple_choice_template_yaml +task: bigbench_moral_permissibility_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml new file mode 100644 index 00000000..831b261a --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: movie_dialog_same_or_different +include: ../multiple_choice_template_yaml +task: bigbench_movie_dialog_same_or_different_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml b/lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml new file mode 100644 index 00000000..16d4ea55 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: movie_recommendation +include: ../multiple_choice_template_yaml +task: bigbench_movie_recommendation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml new file mode 100644 index 00000000..b7693b06 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: mult_data_wrangling +include: ../multiple_choice_template_yaml +task: 
bigbench_mult_data_wrangling_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml b/lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml new file mode 100644 index 00000000..8c954b5d --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: multiemo +include: ../multiple_choice_template_yaml +task: bigbench_multiemo_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml b/lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml new file mode 100644 index 00000000..78d295c5 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: natural_instructions +include: ../multiple_choice_template_yaml +task: bigbench_natural_instructions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/navigate.yaml b/lm_eval/tasks/bigbench/multiple_choice/navigate.yaml new file mode 100644 index 00000000..3bc9f120 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/navigate.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: navigate +include: ../multiple_choice_template_yaml +task: bigbench_navigate_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml new file mode 100644 index 00000000..7a7b2d80 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: nonsense_words_grammar +include: ../multiple_choice_template_yaml +task: bigbench_nonsense_words_grammar_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml b/lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml new file mode 100644 index 00000000..04172c1a --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: novel_concepts +include: ../multiple_choice_template_yaml +task: bigbench_novel_concepts_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml b/lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml new file mode 100644 index 00000000..c6ab4011 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: object_counting +include: ../multiple_choice_template_yaml +task: bigbench_object_counting_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml b/lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml new file mode 100644 index 00000000..82d70a63 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: odd_one_out +include: ../multiple_choice_template_yaml +task: bigbench_odd_one_out_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/operators.yaml b/lm_eval/tasks/bigbench/multiple_choice/operators.yaml new file mode 100644 index 00000000..e48c8005 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/operators.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: operators +include: ../multiple_choice_template_yaml +task: bigbench_operators_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml new file mode 100644 index 
00000000..3423101a --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: paragraph_segmentation +include: ../multiple_choice_template_yaml +task: bigbench_paragraph_segmentation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml new file mode 100644 index 00000000..a2f65cde --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: parsinlu_qa +include: ../multiple_choice_template_yaml +task: bigbench_parsinlu_qa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml new file mode 100644 index 00000000..3f0f6182 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: parsinlu_reading_comprehension +include: ../multiple_choice_template_yaml +task: bigbench_parsinlu_reading_comprehension_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml new file mode 100644 index 00000000..ed4945f9 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: penguins_in_a_table +include: ../multiple_choice_template_yaml +task: bigbench_penguins_in_a_table_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml b/lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml new file mode 100644 index 00000000..5adb9422 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: periodic_elements +include: ../multiple_choice_template_yaml +task: bigbench_periodic_elements_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml b/lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml new file mode 100644 index 00000000..c0ee240f --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: persian_idioms +include: ../multiple_choice_template_yaml +task: bigbench_persian_idioms_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml new file mode 100644 index 00000000..6231e5c0 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: phrase_relatedness +include: ../multiple_choice_template_yaml +task: bigbench_phrase_relatedness_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml b/lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml new file mode 100644 index 00000000..50353ac7 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: physical_intuition +include: ../multiple_choice_template_yaml +task: bigbench_physical_intuition_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/physics.yaml b/lm_eval/tasks/bigbench/multiple_choice/physics.yaml new file mode 100644 index 00000000..f3b4244e --- /dev/null +++ 
b/lm_eval/tasks/bigbench/multiple_choice/physics.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: physics +include: ../multiple_choice_template_yaml +task: bigbench_physics_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml b/lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml new file mode 100644 index 00000000..ec5e9531 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: physics_questions +include: ../multiple_choice_template_yaml +task: bigbench_physics_questions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml new file mode 100644 index 00000000..a81f33b0 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: play_dialog_same_or_different +include: ../multiple_choice_template_yaml +task: bigbench_play_dialog_same_or_different_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml new file mode 100644 index 00000000..af82fce2 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: polish_sequence_labeling +include: ../multiple_choice_template_yaml +task: bigbench_polish_sequence_labeling_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml new file mode 100644 index 00000000..83b733a3 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: presuppositions_as_nli +include: ../multiple_choice_template_yaml +task: bigbench_presuppositions_as_nli_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml b/lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml new file mode 100644 index 00000000..5f52b44c --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: qa_wikidata +include: ../multiple_choice_template_yaml +task: bigbench_qa_wikidata_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml b/lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml new file mode 100644 index 00000000..1b4301bb --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: question_selection +include: ../multiple_choice_template_yaml +task: bigbench_question_selection_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml new file mode 100644 index 00000000..d41cd5dd --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: real_or_fake_text +include: ../multiple_choice_template_yaml +task: bigbench_real_or_fake_text_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..e5e6f520 --- 
/dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: reasoning_about_colored_objects +include: ../multiple_choice_template_yaml +task: bigbench_reasoning_about_colored_objects_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml new file mode 100644 index 00000000..73406e70 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: repeat_copy_logic +include: ../multiple_choice_template_yaml +task: bigbench_repeat_copy_logic_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml b/lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml new file mode 100644 index 00000000..b785712c --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: rephrase +include: ../multiple_choice_template_yaml +task: bigbench_rephrase_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml b/lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml new file mode 100644 index 00000000..e8aff5b3 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: riddle_sense +include: ../multiple_choice_template_yaml +task: bigbench_riddle_sense_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml b/lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml new file mode 100644 index 00000000..7504f388 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ruin_names +include: ../multiple_choice_template_yaml +task: bigbench_ruin_names_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml new file mode 100644 index 00000000..a462eb15 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: salient_translation_error_detection +include: ../multiple_choice_template_yaml +task: bigbench_salient_translation_error_detection_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml b/lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml new file mode 100644 index 00000000..5ea881cd --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: scientific_press_release +include: ../multiple_choice_template_yaml +task: bigbench_scientific_press_release_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml new file mode 100644 index 00000000..886b61be --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: semantic_parsing_in_context_sparc +include: ../multiple_choice_template_yaml +task: bigbench_semantic_parsing_in_context_sparc_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml 
b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml new file mode 100644 index 00000000..cb5dc922 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: semantic_parsing_spider +include: ../multiple_choice_template_yaml +task: bigbench_semantic_parsing_spider_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml b/lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml new file mode 100644 index 00000000..573f6199 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sentence_ambiguity +include: ../multiple_choice_template_yaml +task: bigbench_sentence_ambiguity_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml b/lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml new file mode 100644 index 00000000..1e0c4ffb --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: similarities_abstraction +include: ../multiple_choice_template_yaml +task: bigbench_similarities_abstraction_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml new file mode 100644 index 00000000..2e453821 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simp_turing_concept +include: ../multiple_choice_template_yaml +task: bigbench_simp_turing_concept_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml new file mode 100644 index 00000000..e5e24f58 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json +include: ../multiple_choice_template_yaml +task: bigbench_simple_arithmetic_json_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml new file mode 100644 index 00000000..4fb67ac5 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_multiple_choice +include: ../multiple_choice_template_yaml +task: bigbench_simple_arithmetic_json_multiple_choice_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml new file mode 100644 index 00000000..67853d68 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_subtasks +include: ../multiple_choice_template_yaml +task: bigbench_simple_arithmetic_json_subtasks_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml new file mode 100644 index 00000000..b76bfbde --- /dev/null +++ 
b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_multiple_targets_json +include: ../multiple_choice_template_yaml +task: bigbench_simple_arithmetic_multiple_targets_json_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml new file mode 100644 index 00000000..a8a10ca6 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_ethical_questions +include: ../multiple_choice_template_yaml +task: bigbench_simple_ethical_questions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml new file mode 100644 index 00000000..3bbecfb9 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_text_editing +include: ../multiple_choice_template_yaml +task: bigbench_simple_text_editing_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/snarks.yaml b/lm_eval/tasks/bigbench/multiple_choice/snarks.yaml new file mode 100644 index 00000000..4e0b9d3a --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/snarks.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: snarks +include: ../multiple_choice_template_yaml +task: bigbench_snarks_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml b/lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml new file mode 100644 index 00000000..de12bcbd --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: social_iqa +include: ../multiple_choice_template_yaml +task: bigbench_social_iqa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/social_support.yaml b/lm_eval/tasks/bigbench/multiple_choice/social_support.yaml new file mode 100644 index 00000000..f2e8c795 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/social_support.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: social_support +include: ../multiple_choice_template_yaml +task: bigbench_social_support_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml new file mode 100644 index 00000000..4a3914a4 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sports_understanding +include: ../multiple_choice_template_yaml +task: bigbench_sports_understanding_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml b/lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml new file mode 100644 index 00000000..f0882aa2 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: strange_stories +include: ../multiple_choice_template_yaml +task: bigbench_strange_stories_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml b/lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml new file mode 100644 index 00000000..e99618c0 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml @@ -0,0 +1,4 @@ +# Generated 
by utils.py +dataset_name: strategyqa +include: ../multiple_choice_template_yaml +task: bigbench_strategyqa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml b/lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml new file mode 100644 index 00000000..56af1ae2 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sufficient_information +include: ../multiple_choice_template_yaml +task: bigbench_sufficient_information_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml b/lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml new file mode 100644 index 00000000..5c6f0cd2 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: suicide_risk +include: ../multiple_choice_template_yaml +task: bigbench_suicide_risk_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml new file mode 100644 index 00000000..497980ae --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swahili_english_proverbs +include: ../multiple_choice_template_yaml +task: bigbench_swahili_english_proverbs_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml new file mode 100644 index 00000000..46d49ddc --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swedish_to_german_proverbs +include: ../multiple_choice_template_yaml +task: bigbench_swedish_to_german_proverbs_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml new file mode 100644 index 00000000..a6032ad9 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: symbol_interpretation +include: ../multiple_choice_template_yaml +task: bigbench_symbol_interpretation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml b/lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml new file mode 100644 index 00000000..4a63b2ac --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: temporal_sequences +include: ../multiple_choice_template_yaml +task: bigbench_temporal_sequences_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/tense.yaml b/lm_eval/tasks/bigbench/multiple_choice/tense.yaml new file mode 100644 index 00000000..4fce296d --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/tense.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tense +include: ../multiple_choice_template_yaml +task: bigbench_tense_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/timedial.yaml b/lm_eval/tasks/bigbench/multiple_choice/timedial.yaml new file mode 100644 index 00000000..550d1190 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/timedial.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: timedial +include: ../multiple_choice_template_yaml 
+task: bigbench_timedial_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml b/lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml new file mode 100644 index 00000000..232dc706 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: topical_chat +include: ../multiple_choice_template_yaml +task: bigbench_topical_chat_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml new file mode 100644 index 00000000..8dd68282 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tracking_shuffled_objects +include: ../multiple_choice_template_yaml +task: bigbench_tracking_shuffled_objects_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml b/lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml new file mode 100644 index 00000000..d85d63b1 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: understanding_fables +include: ../multiple_choice_template_yaml +task: bigbench_understanding_fables_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml b/lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml new file mode 100644 index 00000000..0e92a41f --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: undo_permutation +include: ../multiple_choice_template_yaml +task: bigbench_undo_permutation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml b/lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml new file mode 100644 index 00000000..b4d421e2 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unit_conversion +include: ../multiple_choice_template_yaml +task: bigbench_unit_conversion_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml b/lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml new file mode 100644 index 00000000..eb60bc42 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unit_interpretation +include: ../multiple_choice_template_yaml +task: bigbench_unit_interpretation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml new file mode 100644 index 00000000..47c5b755 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unnatural_in_context_learning +include: ../multiple_choice_template_yaml +task: bigbench_unnatural_in_context_learning_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml new file mode 100644 index 00000000..3ddb5e69 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vitaminc_fact_verification +include: 
../multiple_choice_template_yaml +task: bigbench_vitaminc_fact_verification_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml new file mode 100644 index 00000000..dda9a695 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: what_is_the_tao +include: ../multiple_choice_template_yaml +task: bigbench_what_is_the_tao_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml new file mode 100644 index 00000000..a6a5bbbf --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: which_wiki_edit +include: ../multiple_choice_template_yaml +task: bigbench_which_wiki_edit_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml b/lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml new file mode 100644 index 00000000..0b0a858c --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: winowhy +include: ../multiple_choice_template_yaml +task: bigbench_winowhy_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml b/lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml new file mode 100644 index 00000000..c244f547 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: word_sorting +include: ../multiple_choice_template_yaml +task: bigbench_word_sorting_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml b/lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml new file mode 100644 index 00000000..a993ef33 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: word_unscrambling +include: ../multiple_choice_template_yaml +task: bigbench_word_unscrambling_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice_template_yaml b/lm_eval/tasks/bigbench/multiple_choice_template_yaml new file mode 100644 index 00000000..3dd2af61 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml @@ -0,0 +1,10 @@ +group: bigbench +dataset_path: bigbench +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: inputs +doc_to_target: "{{multiple_choice_targets.index(targets[0])}}" +doc_to_choice: "{{multiple_choice_targets}}" +metric_list: + - metric: acc -- GitLab From 76227f0dbec309c7124de09a9b7ce4009f8bb8f1 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 21 Sep 2023 14:07:58 +0000 Subject: [PATCH 040/212] add codexglue --- lm_eval/tasks/code_x_glue/code-text/bleu.py | 218 ++++++++++--------- lm_eval/tasks/code_x_glue/code-text/utils.py | 12 +- 2 files changed, 126 insertions(+), 104 deletions(-) diff --git a/lm_eval/tasks/code_x_glue/code-text/bleu.py b/lm_eval/tasks/code_x_glue/code-text/bleu.py index 50243474..aff16afe 100644 --- a/lm_eval/tasks/code_x_glue/code-text/bleu.py +++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py @@ -1,12 +1,12 @@ #!/usr/bin/python -''' -This script was adapted from the original version by hieuhoang1972 which is part of MOSES. -''' +""" +This script was adapted from the original version by hieuhoang1972 which is part of MOSES. 
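[Note on the generated BIG-bench configs above: each per-task file is four lines and differs only in its dataset_name and task fields, while the shared multiple_choice_template_yaml supplies the dataset path, splits, and scoring, with doc_to_target selecting the index of the first gold target inside multiple_choice_targets. The generator behind the "# Generated by utils.py" headers is not included in these patches; a minimal sketch of what it presumably looks like follows, where the subtask list is abridged and the save path, function name, and header line are assumptions.]

# Hypothetical sketch of the generator behind the "# Generated by utils.py"
# headers above; subtask list, save path, and function name are assumptions.
import os

import yaml

SUBTASKS = ["hindu_knowledge", "hinglish_toxicity", "hyperbaton"]  # abridged


def gen_multiple_choice_configs(save_dir="lm_eval/tasks/bigbench/multiple_choice"):
    os.makedirs(save_dir, exist_ok=True)
    for subtask in SUBTASKS:
        config = {
            "dataset_name": subtask,
            "include": "../multiple_choice_template_yaml",
            "task": f"bigbench_{subtask}_multiple_choice",
        }
        with open(os.path.join(save_dir, f"{subtask}.yaml"), "w") as f:
            f.write("# Generated by utils.py\n")
            # yaml.dump sorts keys alphabetically by default, matching the
            # dataset_name / include / task ordering of the files above.
            yaml.dump(config, f)


if __name__ == "__main__":
    gen_multiple_choice_configs()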
+""" # $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $ -'''Provides: +"""Provides: cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). @@ -15,7 +15,7 @@ score_cooked(alltest, n=4): Score a list of cooked test sentences. score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids. The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible. -''' +""" import sys, math, re, xml.sax.saxutils import subprocess @@ -28,167 +28,188 @@ preserve_case = False eff_ref_len = "shortest" normalize1 = [ - ('', ''), # strip "skipped" tags - (r'-\n', ''), # strip end-of-line hyphenation and join lines - (r'\n', ' '), # join lines -# (r'(\d)\s+(?=\d)', r'\1'), # join digits + ("", ""), # strip "skipped" tags + (r"-\n", ""), # strip end-of-line hyphenation and join lines + (r"\n", " "), # join lines + # (r'(\d)\s+(?=\d)', r'\1'), # join digits ] normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1] normalize2 = [ - (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing - (r'([^0-9])([\.,])',r'\1 \2 '), # tokenize period and comma unless preceded by a digit - (r'([\.,])([^0-9])',r' \1 \2'), # tokenize period and comma unless followed by a digit - (r'([0-9])(-)',r'\1 \2 ') # tokenize dash when preceded by a digit + ( + r"([\{-\~\[-\` -\&\(-\+\:-\@\/])", + r" \1 ", + ), # tokenize punctuation. apostrophe is missing + ( + r"([^0-9])([\.,])", + r"\1 \2 ", + ), # tokenize period and comma unless preceded by a digit + ( + r"([\.,])([^0-9])", + r" \1 \2", + ), # tokenize period and comma unless followed by a digit + (r"([0-9])(-)", r"\1 \2 "), # tokenize dash when preceded by a digit ] normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2] + def normalize(s): - '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.''' + """Normalize and tokenize text. 
This is lifted from NIST mteval-v11a.pl.""" # Added to bypass NIST-style pre-processing of hyp and ref files -- wade - if (nonorm): + if nonorm: return s.split() if type(s) is not str: s = " ".join(s) # language-independent part: for (pattern, replace) in normalize1: s = re.sub(pattern, replace, s) - s = xml.sax.saxutils.unescape(s, {'"':'"'}) + s = xml.sax.saxutils.unescape(s, {""": '"'}) # language-dependent part (assuming Western languages): s = " %s " % s if not preserve_case: - s = s.lower() # this might not be identical to the original + s = s.lower() # this might not be identical to the original for (pattern, replace) in normalize2: s = re.sub(pattern, replace, s) return s.split() + def count_ngrams(words, n=4): counts = {} - for k in range(1,n+1): - for i in range(len(words)-k+1): - ngram = tuple(words[i:i+k]) - counts[ngram] = counts.get(ngram, 0)+1 + for k in range(1, n + 1): + for i in range(len(words) - k + 1): + ngram = tuple(words[i : i + k]) + counts[ngram] = counts.get(ngram, 0) + 1 return counts + def cook_refs(refs, n=4): - '''Takes a list of reference sentences for a single segment + """Takes a list of reference sentences for a single segment and returns an object that encapsulates everything that BLEU - needs to know about them.''' - + needs to know about them.""" + refs = [normalize(ref) for ref in refs] maxcounts = {} for ref in refs: counts = count_ngrams(ref, n) - for (ngram,count) in counts.items(): - maxcounts[ngram] = max(maxcounts.get(ngram,0), count) + for (ngram, count) in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) return ([len(ref) for ref in refs], maxcounts) + def cook_test(test, item, n=4): - '''Takes a test sentence and returns an object that - encapsulates everything that BLEU needs to know about it.''' - (reflens, refmaxcounts)=item + """Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.""" + (reflens, refmaxcounts) = item test = normalize(test) result = {} result["testlen"] = len(test) # Calculate effective reference sentence length. 
- + if eff_ref_len == "shortest": result["reflen"] = min(reflens) elif eff_ref_len == "average": - result["reflen"] = float(sum(reflens))/len(reflens) + result["reflen"] = float(sum(reflens)) / len(reflens) elif eff_ref_len == "closest": min_diff = None for reflen in reflens: - if min_diff is None or abs(reflen-len(test)) < min_diff: - min_diff = abs(reflen-len(test)) - result['reflen'] = reflen + if min_diff is None or abs(reflen - len(test)) < min_diff: + min_diff = abs(reflen - len(test)) + result["reflen"] = reflen - result["guess"] = [max(len(test)-k+1,0) for k in range(1,n+1)] + result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)] - result['correct'] = [0]*n + result["correct"] = [0] * n counts = count_ngrams(test, n) for (ngram, count) in counts.items(): - result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) + result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) return result + def score_cooked(allcomps, n=4, ground=0, smooth=1): - totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} + totalcomps = {"testlen": 0, "reflen": 0, "guess": [0] * n, "correct": [0] * n} for comps in allcomps: - for key in ['testlen','reflen']: + for key in ["testlen", "reflen"]: totalcomps[key] += comps[key] - for key in ['guess','correct']: + for key in ["guess", "correct"]: for k in range(n): totalcomps[key][k] += comps[key][k] logbleu = 0.0 all_bleus = [] for k in range(n): - correct = totalcomps['correct'][k] - guess = totalcomps['guess'][k] - addsmooth = 0 - if smooth == 1 and k > 0: - addsmooth = 1 - logbleu += math.log(correct + addsmooth + sys.float_info.min)-math.log(guess + addsmooth+ sys.float_info.min) - if guess == 0: - all_bleus.append(-10000000) - else: - all_bleus.append(math.log(correct + sys.float_info.min)-math.log( guess )) + correct = totalcomps["correct"][k] + guess = totalcomps["guess"][k] + addsmooth = 0 + if smooth == 1 and k > 0: + addsmooth = 1 + logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log( + guess + addsmooth + sys.float_info.min + ) + if guess == 0: + all_bleus.append(-10000000) + else: + all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess)) logbleu /= float(n) all_bleus.insert(0, logbleu) - brevPenalty = min(0,1-float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1)) + brevPenalty = min( + 0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1) + ) for i in range(len(all_bleus)): - if i ==0: - all_bleus[i] += brevPenalty - all_bleus[i] = math.exp(all_bleus[i]) + if i == 0: + all_bleus[i] += brevPenalty + all_bleus[i] = math.exp(all_bleus[i]) return all_bleus -def bleu(refs, candidate, ground=0, smooth=1): + +def bleu(refs, candidate, ground=0, smooth=1): refs = cook_refs(refs) test = cook_test(candidate, refs) return score_cooked([test], ground=ground, smooth=smooth) + def splitPuncts(line): - return ' '.join(re.findall(r"[\w]+|[^\s\w]", line)) + return " ".join(re.findall(r"[\w]+|[^\s\w]", line)) + def computeMaps(predictions, goldfile): - predictionMap = {} - goldMap = {} - gf = open(goldfile, 'r') - - for row in predictions: - cols = row.strip().split('\t') - if len(cols) == 1: - (rid, pred) = (cols[0], '') - else: - (rid, pred) = (cols[0], cols[1]) - predictionMap[rid] = [splitPuncts(pred.strip().lower())] - - for row in gf: - (rid, pred) = row.split('\t') - if rid in predictionMap: # Only insert if the id exists for the method - if rid not in goldMap: - goldMap[rid] = [] - goldMap[rid].append(splitPuncts(pred.strip().lower())) 
- - sys.stderr.write('Total: ' + str(len(goldMap)) + '\n') - return (goldMap, predictionMap) - - -#m1 is the reference map -#m2 is the prediction map + predictionMap = {} + goldMap = {} + gf = open(goldfile, "r") + + for row in predictions: + cols = row.strip().split("\t") + if len(cols) == 1: + (rid, pred) = (cols[0], "") + else: + (rid, pred) = (cols[0], cols[1]) + predictionMap[rid] = [splitPuncts(pred.strip().lower())] + + for row in gf: + (rid, pred) = row.split("\t") + if rid in predictionMap: # Only insert if the id exists for the method + if rid not in goldMap: + goldMap[rid] = [] + goldMap[rid].append(splitPuncts(pred.strip().lower())) + + sys.stderr.write("Total: " + str(len(goldMap)) + "\n") + return (goldMap, predictionMap) + + +# m1 is the reference map +# m2 is the prediction map def bleuFromMaps(m1, m2): - score = [0] * 5 - num = 0.0 + score = [0] * 5 + num = 0.0 - for key in m1: - if key in m2: - bl = bleu(m1[key], m2[key][0]) - score = [ score[i] + bl[i] for i in range(0, len(bl))] - num += 1 - return [s * 100.0 / num for s in score] + for key in m1: + if key in m2: + bl = bleu(m1[key], m2[key][0]) + score = [score[i] + bl[i] for i in range(0, len(bl))] + num += 1 + return [s * 100.0 / num for s in score] def smoothed_bleu_4(references, predictions, **kwargs): @@ -197,17 +218,18 @@ def smoothed_bleu_4(references, predictions, **kwargs): goldMap = {} for rid, pred in enumerate(predictions): - predictionMap[rid] = [splitPuncts(pred.strip().lower())] + predictionMap[rid] = [splitPuncts(pred.strip().lower())] for rid, row in enumerate(references): - goldMap[rid] = [splitPuncts(row.strip().lower())] + goldMap[rid] = [splitPuncts(row.strip().lower())] return bleuFromMaps(goldMap, predictionMap)[0] -if __name__ == '__main__': - reference_file = sys.argv[1] - predictions = [] - for row in sys.stdin: - predictions.append(row) - (goldMap, predictionMap) = computeMaps(predictions, reference_file) - print (bleuFromMaps(goldMap, predictionMap)[0]) + +if __name__ == "__main__": + reference_file = sys.argv[1] + predictions = [] + for row in sys.stdin: + predictions.append(row) + (goldMap, predictionMap) = computeMaps(predictions, reference_file) + print(bleuFromMaps(goldMap, predictionMap)[0]) diff --git a/lm_eval/tasks/code_x_glue/code-text/utils.py b/lm_eval/tasks/code_x_glue/code-text/utils.py index 89cbbdf3..981a00b9 100644 --- a/lm_eval/tasks/code_x_glue/code-text/utils.py +++ b/lm_eval/tasks/code_x_glue/code-text/utils.py @@ -1,14 +1,14 @@ - def doc_to_text(doc): - inputs = ' '.join(doc['code_tokens']).replace('\n',' ') - inputs = ' '.join(inputs.strip().split()) + inputs = " ".join(doc["code_tokens"]).replace("\n", " ") + inputs = " ".join(inputs.strip().split()) return inputs + def doc_to_target(doc): - targets = ' '.join(doc['docstring_tokens']).replace('\n','') - targets = ' '.join(targets.strip().split()) + targets = " ".join(doc["docstring_tokens"]).replace("\n", "") + targets = " ".join(targets.strip().split()) - return targets \ No newline at end of file + return targets -- GitLab From f0d8b559a9fcca7f115bea8e5ed13996749aeba8 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 21 Sep 2023 14:08:21 +0000 Subject: [PATCH 041/212] update --- lm_eval/tasks/bbh/_generate_configs.py | 44 +++++++++---------- .../_flan_cot_fewshot_template_yaml | 2 +- .../_flan_cot_zeroshot_template_yaml | 2 +- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index ae2fb38f..0c882af0 100644 --- 
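[Usage note for the bleu.py module added in PATCH 040 above. Its docstring describes a three-phase API: cook_refs() pre-processes the references once, cook_test() combines a single candidate with the cooked references, and score_cooked() scores any number of cooked candidates together, with smoothed_bleu_4() as the corpus-level entry point. A sketch follows; the example strings are invented, and the bare import assumes bleu.py's directory is on sys.path, since "code-text" contains a hyphen and cannot be imported as a package name.]

# Usage sketch for bleu.py above. The example strings are invented, and the
# bare import assumes the containing directory is on sys.path.
import bleu

refs = ["returns the sum of two numbers ."]
hyp = "return the sum of two numbers ."

# Phase 1: cook the references once (effective lengths + max n-gram counts).
cooked_refs = bleu.cook_refs(refs)
# Phase 2: cook one candidate sentence against the cooked references.
cooked_test = bleu.cook_test(hyp, cooked_refs)
# Phase 3: score a batch of cooked candidates; index 0 is the smoothed BLEU.
scores = bleu.score_cooked([cooked_test])

# Corpus-level smoothed BLEU-4 on a 0-100 scale, averaged over examples;
# presumably the metric the code_x_glue task configs reference.
print(bleu.smoothed_bleu_4(references=refs, predictions=[hyp]))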
a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -13,18 +13,13 @@ from tqdm import tqdm from lm_eval import utils from lm_eval.logger import eval_logger + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--base_yaml_path", required=True) - parser.add_argument( - "--save_prefix_path", default="flan_zeroshot" - ) - parser.add_argument( - "--cot", default=False - ) - parser.add_argument( - "--fewshot", default=False - ) + parser.add_argument("--save_prefix_path", default="flan_zeroshot") + parser.add_argument("--cot", default=False) + parser.add_argument("--fewshot", default=False) parser.add_argument("--task_prefix", default="") return parser.parse_args() @@ -44,7 +39,9 @@ if __name__ == "__main__": dataset_path = "lukaemon/bbh" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): - resp = requests.get(f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt").content.decode('utf-8') + resp = requests.get( + f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt" + ).content.decode("utf-8") prompt = resp.split("\n-----\n")[-1] description, *few_shot = prompt.split("\n\nQ:") @@ -54,13 +51,13 @@ if __name__ == "__main__": prefix_doc_to_text = " ".join(few_shot) else: for shot in few_shot: - shot = "Q:"+shot + shot = "Q:" + shot try: answer = answer_regex.search(shot)[0] except: print("task", task) print(shot) - example = shot.split("Let\'s think step by step.")[0] + example = shot.split("Let's think step by step.")[0] prefix_doc_to_text += f"{example}{answer}\n\n" doc_to_text = prefix_doc_to_text + base_doc_to_text @@ -68,17 +65,20 @@ if __name__ == "__main__": doc_to_text = doc_to_text + " Let's think step by step.\n" yaml_dict = { - "include": base_yaml_name, - "task": f"bbh_{args.task_prefix}_{task}", - "dataset_name": task, - "description": description+"\n\n", - "doc_to_text": doc_to_text, - } + "include": base_yaml_name, + "task": f"bbh_{args.task_prefix}_{task}", + "dataset_name": task, + "description": description + "\n\n", + "doc_to_text": doc_to_text, + } file_save_path = args.save_prefix_path + f"/{task}.yaml" eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}") with open(file_save_path, "w") as yaml_file: - yaml.dump(yaml_dict, yaml_file, width=float("inf"), allow_unicode=True, default_style='"') - - - + yaml.dump( + yaml_dict, + yaml_file, + width=float("inf"), + allow_unicode=True, + default_style='"', + ) diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml index 680c2533..2e2e8bc9 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml @@ -19,4 +19,4 @@ filter_list: filter: - function: "regex" regex_pattern: "(?<=the answer is )(.*)(?=.)" - - function: "take_first" \ No newline at end of file + - function: "take_first" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml index 66ab12e3..b6574a4e 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml @@ -19,4 +19,4 @@ filter_list: filter: - function: "regex" regex_pattern: "(?<=the answer is )(.*)(?=.)" - - function: "take_first" \ No newline at end of file + - function: 
"take_first" -- GitLab From ea8b5beb04a705f63401969bcfd3ffa02f216645 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 21 Sep 2023 14:08:39 +0000 Subject: [PATCH 042/212] update --- lm_eval/tasks/mmlu/_cot_prompts.json | 2 +- lm_eval/tasks/mmlu/_generate_configs.py | 21 ++++++++++++------- .../tasks/mmlu/default/_default_template_yaml | 2 +- .../_mmlu_flan_cot_fewshot_template_yaml | 2 +- .../_mmlu_flan_loglikelihood_template_yaml | 2 +- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/lm_eval/tasks/mmlu/_cot_prompts.json b/lm_eval/tasks/mmlu/_cot_prompts.json index 4714567a..fea25419 100644 --- a/lm_eval/tasks/mmlu/_cot_prompts.json +++ b/lm_eval/tasks/mmlu/_cot_prompts.json @@ -1 +1 @@ -{"abstract_algebra": "The following are multiple choice questions (with answers) about abstract algebra.\n\nQ: Statement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. A cyclic group is a group that is generated by a single element. Hence a subgroup generated by a single element of a group is cyclic and Statement 1 is True. The answer is (C).\n\nQ: The symmetric group $S_n$ has $\nactorial{n}$ elements, hence it is not true that $S_{10}$ has 10 elements.\nFind the characteristic of the ring 2Z.\n(A) 0 (B) 3 (C) 12 (D) 30\nA: Let's think step by step. A characteristic of a ring is R is $n$ if the statement $ka = 0$ for all $a\\in 2Z$ implies that $k$ is a multiple of $n$. Assume that $ka = 0$ for all $a\\in 2Z$ for some $k$. In particular $2k = 0$. Hence $k=0$ and $n=0$. The answer is (A).\n\nQ: Statement 1| Every function from a finite set onto itself must be one to one. Statement 2 | Every subgroup of an abelian group is abelian.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. Statement 1 is true. Let $S$ be a finite set. If $f:S \nightarrow S$ is a onto function, then $|S| = |f(S)|$. If $f$ was not one to one, then for finite domain $S$ the image would have less than $S$ elements, a contradiction.\nStatement 2 is true. Let $G$ be an abelian group and $H$ be a subgroup of $G$. We need to show that $H$ is abelian. Let $a,b \\in H$. Then $a,b \\in G$ and $ab=ba$. Since $G$ is abelian, $ab=ba$. Since $H$ is a subgroup of $G$, $ab \\in H$. Therefore, $ab=ba$ and $H$ is abelian. The answer is (A).\n\nQ: Statement 1 | If aH is an element of a factor group, then |aH| divides |a|. Statement 2 | If H and K are subgroups of G then HK is a subgroup of G.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. Statement 2 is false. Let $H$ be a subgroup of $S_3$ generated by the cycle $(1,2)$ and $K$ be a subgroup of $S_3$ generated by the cycle $(1,3)$. Both $H$ and $K$ have two elements, the generators and the identity. However $HK$ contains cycles (1,2), (1,3) and (2,3,1), but the inverse of (2,3,1) is (2,1,3) and it does not belong to HK, hence HK is not a subgroup. The answer is (B).\n\nQ: Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\n(A) 0 (B) 1 (C) 2 (D) 3\nA: Let's think step by step. Z_3[x]/(x^2 + c) is a field if and only if x^2 + c does not have roots in Z_3. That is x^2 + c != 0 for every x in Z_3. If c = 0, then x^2 + c = x^2 has root 0. If c = 1 then x^2 + c = x^2 + 1 = 0 + 1 for x = 0, 1 + 1 = 2 for x = 1 and 1 + 1 = 2 for x = 2, hence x^2 + 1 does not have any roots. 
For c = 2 the polynomial x^2 + 2 has two roots at x = 1 and x = 2. Hence Z_3[x]/(x^2 + c) is a field if and only if c = 1. The answer is (B).", "anatomy": "The following are multiple choice questions (with answers) about anatomy.\n\nQ: Which of the following is the body cavity that contains the pituitary gland?\n(A) Abdominal (B) Cranial (C) Pleural (D) Spinal\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. The pituitary gland is the major endocrine gland attached to the base of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\nQ: Which of these branches of the trigeminal nerve contain somatic motor processes?\n(A) The supraorbital nerve (B) The infraorbital nerve (C) The mental nerve (D) None of the above\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. \nWe know the following: (A) The supraorbital nerve (also known as the frontal nerve) is the largest branch of the ophthalmic nerve and branch of ophthalmic division of the trigeminal nerve. (B) The infraorbital nerve is a branch of the maxillary division of the trigeminal nerve. (C) The mental nerve is a branch of the mandibular division of the trigeminal nerve. Because all these nerves are purely sensory nerves and do not contain any somatic motor processes. Therefore, the answer should be none of the above, which is (D). The answer is (D).\n\nQ: In Angle's Class II Div 2 occlusion there is\n(A) excess overbite of the upper lateral incisors. (B) negative overjet of the upper central incisors. (C) excess overjet of the upper lateral incisors. (D) excess overjet of the upper central incisors.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. This is a question related to anatomy and orthodontics. Excess overjet is associated with Class II occlusions; therefore, we can safely eliminate (B) from the list, as negative overjet is often associated with Class III occlusions. Now, we need to determine the location of the excess overjet, and that would be the upper (maxillary) lateral incisors. Only (C) has the correct information. The answer is (C).\n\nQ: The pleura\n(A) have no sensory innervation. (B) are separated by a 2 mm space. (C) extend into the neck. (D) are composed of respiratory epithelium.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. First, recall that the pleura refers to the thin layer of tissue that covers the lungs and lines the interior wall of the chest cavity. Now, let\u2019s look at each option:\nOption (A): \u201cThe pleura have no sensory innervation.\u201d This information is not correct. The pleura do have a sensory innervation.\nOption (B): \u201cThe pleura are separated by a 2 mm space.\u201d This information is not correct. There is a very thin \u201cpotential\u201d space between the layers of the pleura; however, it is typically filled with serous pleural fluid. \nOption (C): \u201cThe pleura extend into the neck.\u201d This information is actuakky true. The cervical pleura, also known as the dome of the pleuradome of the pleura, lines the extendsiton of the pleural cavity into the neck.\nOption (D): \u201cThe pleura are composed of respiratory epithelium.\u201d This information is not correct. 
The pleaura are composed of connective tissue (CT).\nBecause (A), (B), and (D) are all incorrect, (D) is the only correct answer. The answer is (C).\n\nQ: What is the embryological origin of the hyoid bone?\n(A) The first pharyngeal arch (B) The first and second pharyngeal arches (C) The second pharyngeal arch (D) The second and third pharyngeal arches\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. The hyoid bone, which is also known as the hyooid, is a a small U-shaped bone located in the anterior neck. In its resting position, it lies between the ase of the mandible and the third cervical vertebrae. We know that the second and the third pharyngeal arches give rise to the horns of the hyoid bone; therefore, the embryological origin of the hyoid bone are the second and the third pharyngeal arches\u2014this information is covered in the last option (D). Therefore, we conclude that (D) must be the correct answer. The answer is (D).", "astronomy": "The following are multiple choice questions (with answers) about astronomy.\n\nQ: Where do most short-period comets come from and how do we know?\n(A) The Kuiper belt; short period comets tend to be in the plane of the solar system just like the Kuiper belt. (B) The Kuiper belt; short period comets tend to come from random directions indicating a spherical distribution of comets called the Kuiper belt. (C) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt. (D) The Oort cloud; short period comets tend to be in the plane of the solar system just like the Oort cloud.\nA: Let's think step by step. Most short-period comets come from the Kuiper belt, and we know because short period coments tend to be in the plane of the solar system, just like the Kuiper belt is. The answer is (A).\n\nQ: You are pushing a truck along a road. Would it be easier to accelerate this truck on Mars? Why? (Assume there is no friction)\n(A) It would be harder since the truck is heavier on Mars. (B) It would be easier since the truck is lighter on Mars. (C) It would be harder since the truck is lighter on Mars. (D) It would be the same no matter where you are.\nA: Let's think step by step. If we assume that there is no friction, the force needed to accelerate the truck is by Newton\u2019s second law only dependent on the mass of the truck. Hence (A), (B) and (C) are incorrect since it doesn\u2019t matter that it\u2019s on Mars, and (D) is the correct answer. The answer is (D).\n\nQ: Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?\n(A) 10000 times more (B) 100 times more (C) 1000 times more (D) 10 times more\nA: Let's think step by step. The amount of light is proportional to the aperture area $A = \\pi D^2/4$ for a lens with diameter $D$, so the relative amounts of light between the eye with diameter 5mm and the telescope with diameter 50mm is $(50 cm)^2/(5mm)^2 = 10000$. The answer is (A).\n\nQ: Why isn't there a planet where the asteroid belt is located?\n(A) A planet once formed here but it was broken apart by a catastrophic collision. (B) There was not enough material in this part of the solar nebula to form a planet. (C) There was too much rocky material to form a terrestrial planet but not enough gaseous material to form a jovian planet. 
(D) Resonance with Jupiter prevented material from collecting together to form a planet.\nA: Let's think step by step. The asteroid belt is a stellar disc consisting of a large number of asteroids between Mars and Jupiter's orbits. The asteroids in this belt are affected by the gravitational pull from both other asteroids and nearby planets. Due to the strong gravitational force of Jupiter there are resonances that give rise to low density regions of asteroids known as the Kirkwood gap. So (B) and (C) are not correct since it\u2019s not a lack of material that prevents a planet from being formed, and (A) is incorrect because the Kirkwood gap would have prevented a planet from forming in the first place, and (D) is the correct option. The answer is (D).\n\nQ: Why is Mars red?\n(A) Because the surface is covered with heavily oxidized (\"rusted\") minerals. (B) Because the atmosphere scatters more light at bluer wavelengths transmitting mostly red light. (C) Because Mars is covered with ancient lava flows which are red in color. (D) Because flowing water on Mars's surface altered the surface minerals several billion years ago.\nA: Let's think step by step. Option (B) is not correct because if the red color was caused by the scattering off the atmosphere, then the earth with a much thicker atmosphere would also look red. Options (C) and (D) are not specific enough about why the color of the surface would be red, while (A) is correct because it explains that the surface is red due to the rusted materials on the surface and the red color comes from the rust. So the correct option is (A). The answer is (A).", "business_ethics": "The following are multiple choice questions (with answers) about business ethics.\n\nQ: In contrast to _______, _______ aim to reward favourable behaviour by companies. The success of such campaigns have been heightened through the use of ___________, which allow campaigns to facilitate the company in achieving _________ .\n(A) Buycotts, Boycotts, Blockchain technology, Charitable donations (B) Buycotts, Boycotts, Digital technology, Increased Sales (C) Boycotts, Buyalls, Blockchain technology, Charitable donations (D) Boycotts, Buycotts, Digital technology, Increased Sales\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cIn contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies. The success of such campaigns have been heightened through the use of *digital technology*, which allow campaigns to facilitate the company in achieving *increased sales*.\u201d The answer is (D).\n\nQ: _______ is the direct attempt to formally or informally manage ethical issues or problems, through specific policies, practices and programmes.\n(A) Corporate social responsibility (B) Business ethics management (C) Sustainability (D) Environmental management\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The direct attempt manage ethical issues through specific policies, practices, and programs is business ethics management. 
The answer is (B).\n\nQ: Three contrasting tactics that CSO's can engage in to meet their aims are ________ which typically involves research and communication, ________, which may involve physically attacking a company's operations or ________, often involving some form of _______.\n(A) Non-violent direct action, Violent direct action, Indirect action, Boycott (B) Indirect action, Instrumental action, Non-violent direct action, Information campaign (C) Indirect action, Violent direct action, Non-violent direct-action Boycott (D) Non-violent direct action, Instrumental action, Indirect action, Information campaign\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cThree contrasting tactics that CSO's can engage in to meet their aims are *indirect action*, which typically involves research and communication, *violent direct action*, which may involve physically attacking a company's operations or *non-violent direct action*, often involving some form of *boycott*.\u201d The answer is (C).\n\nQ: To ensure the independence of the non-executive board members, there are a number of steps which can be taken, which include non-executives being drawn from _______ the company, being appointed for a _________ time period as well as being appointed _________.\n(A) Outside, Limited, Independently (B) Inside, Limited, Intermittently (C) Outside, Unlimited, Intermittently (D) Inside, Unlimited, Independently\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cTo ensure the independence of the non-executive board members, there are a number of steps which can be taken, which include non-executives being draw from *outside* the company, being appointed for a *limited* time period as well as being imported *independently*. The answer is (A).\n\nQ: Beyond the business case for engaging in CSR there are a number of moral arguments relating to: negative _______, the _______that corporations possess and the ________ of business and society.\n(A) Externalities, Power, Independence (B) Publicity, Insubstantial resources, Mutual dependence (C) Publicity, Power, Independence (D) Externalities, Power, Mutual dependence\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cBeyond the business case for engaging the CSR there are a number of moral arguments relating to: negative *externalities*, the *power* that corporations possess and the *mutual independence* of business and society. The answer is (D).", "clinical_knowledge": "The following are multiple choice questions (with answers) about clinical knowledge.\n\nQ: Glycolysis is the name given to the pathway involving the conversion of:\n(A) glycogen to glucose-1-phosphate. (B) glycogen or glucose to fructose. (C) glycogen or glucose to pyruvate or lactate. (D) glycogen or glucose to pyruvate or acetyl CoA.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. Glycolysis is the name given to the pathway involving conversion of glycogen or glucose to pyruvate or lactate. The answer is (C).\n\nQ: What is the difference between a male and a female catheter?\n(A) Male and female catheters are different colours. (B) Male catheters are longer than female catheters. (C) Male catheters are bigger than female catheters. 
(D) Female catheters are longer than male catheters.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. The difference between a male and female catheter is that male catheters tend to be longer than female catheters. The answer is (B).\n\nQ: How many attempts should you make to cannulate a patient before passing the job on to a senior colleague, according to the medical knowledge of 2020?\n(A) 4 (B) 3 (C) 2 (D) 1\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. According to the medical protocol as of 2020, you should make two attempts to cannulate a patient before passing the job on to a more-senior practitioner. The answer is (C).\n\nQ: In the assessment of the hand function which of the following is true?\n(A) Abduction of the thumb is supplied by spinal root T2 (B) Opposition of the thumb by opponens policis is supplied by spinal root T1 (C) Finger adduction is supplied by the median nerve (D) Finger abduction is mediated by the palmar interossei\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. Of all the options, it is only true that the opposition of the thumb by opponens pollicis is supplied by spinal root T1. The answer is (B).\n\nQ: The energy for all forms of muscle contraction is provided by:\n(A) ATP. (B) ADP. (C) phosphocreatine. (D) oxidative phosphorylation.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. The energy for muscular contraction is provided by ATP (adenosine triphosphate), which is the powerhouse of the cell. The answer is (A).", "college_biology": "The following are multiple choice questions (with answers) about college biology.\n\nQ: Which of the following represents an accurate statement concerning arthropods?\n(A) They possess an exoskeleton composed primarily of peptidoglycan. (B) They possess an open circulatory system with a dorsal heart. (C) They are members of a biologically unsuccessful phylum incapable of exploiting diverse habitats and nutrition sources. (D) They lack paired, jointed appendages.\nA: Let's think step by step. Peptidoglycan is known to comprise the plasma membrane of most bacteria, rather than the exoskeleton of arthropods, which is made of chitin, which rules out (A). The answer (C) is false because arthropods are a highly successful phylum. Likewise, arthropods have paired, jointed appendages, which rules out (D). The only remaining option is (B), as arthropods have an open circulatory system with a dorsal tubular heart. The answer is (B).\n\nQ: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?\n(A) 1/400 (B) 19/400 (C) 20/400 (D) 38/400\nA: Let's think step by step. According to the Hardy Weinberg Law, $p^2 + 2 p q + q^2 = 1$, and $p + q = 1$ where $p$ is the frequency of the dominant allele, $q$ is the frequency of the recessive allele, and $p^2$, $q^2$, and $2pq$ are the frequencies of dominant homozygous, recessive homozygous, and heterozygous individuals, respectively. \u200bThe frequency of the recessive allele (q) is $\\sqrt{\frac{1}{400}} = 0.05$. We have $p = 1 - q = 0.95$. The frequency of heterozygous individuals is $2pq = 2 \\cdot 0.05 \\cdot 0.95 = 0.095$. 
The number of heterozygous individuals is equal to the frequency of heterozygous individuals times the size of the population, or $0.095 * 400 = 38$. So we end up with 38/400. The answer is (D).\n\nQ: According to the pressure-flow model of movement of phloem contents, photosynthate movement from source to sink is driven by\n(A) an ATP-dependent pressure-flow pump (B) a water-pressure potential gradient (C) transpiration (D) apoplastic diffusion\nA: Let's think step by step. It is a gradient in water pressure that induces the movement of phloem content, which refers to answer (B). The mechanism of movement does not rely on metabolism, which rules out (A). Transpiration refers to the exhalation of water vapor through plant stomata, and is also not related, which rules out (C). While the apoplastic pathway is one of two main pathways for water transport in plants, it is not central to the pressure flow model, which rules out (D). The answer is (B).\n\nQ: Which of the following contain DNA sequences required for the segregation of chromosomes in mitosis and meiosis?\n(A) Telomeres (B) Centromeres (C) Nucleosomes (D) Spliceosomes\nA: Let's think step by step. The genetic material in Telomeres is not used, which rules out (A). Nucleosomes are the repeating subunit that comprises chromatin packed in a cell nucleus, and do not specifically refer to DNA sequences necessary for segregating chromosomes in cell division, which rules out (C). A spliceosome is a large ribonucleoprotein that removes introns from transcribed pre-mRNA rather than governing chromosome segregation. Centromeres are directly responsible for segregating chromosomes in cell division. The answer is (B).\n\nQ: The presence of homologous structures in two different organisms, such as the humerus in the front limb of a human and a bird, indicates that\n(A) the human and bird are polyphyletic species (B) a human's and bird's evolution is convergent (C) the human and bird belong to a clade (D) the human and bird developed by analogy\nA: Let's think step by step. Polyphyletic species are organisms that are grouped due to having similar characteristics but which do not have a common ancestor. This is not the case for humans and birds, which rules out (A). Convergent evolution refers to the indepdendent development of similar features in different species at different periods, which is also not the case for humans and birds, which rules out (B). Analogy refers to the superficial resemblance of structures that have different origins, which is not the case for the human and bird forearms, which rules out (D). Humans and birds do belong to the same clade - a group of organisms composed of a common ancestor. The answer is (C).", "college_chemistry": "The following are multiple choice questions (with answers) about college chemistry.\n\nQ: 3 Cl\u2212(aq) + 4 CrO_4^2\u2212(aq) + 23 H+(aq) \u2192 3 HClO2(aq) + 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl\u2212(aq) behaves as\n(A) an acid (B) a base (C) a catalyst (D) a reducing agent\nA: Let's think step by step. A molecule that behaves as a base accepts an H+ ion (or proton) from another molecule, whereas a molecule that behaves as an acid donates an H+ ion (or proton) to another molecule. Neither of these is the case for Cl in this reaction, which rules out (A) and (B). A catalyst is a substance that only accelerates a reaction without itself undergoing chemical change, which is not the case here. This rules out (C). 
Instead, the $Cl^{-} molecules carry a negative charge, which they donate in the reaction to form 3 HClO2. This is the behavior of a reducing agent, or (D). The answer is (D).\n\nQ: Which of the following statements about the lanthanide elements is NOT true?\n(A) The most common oxidation state for the lanthanide elements is +3. (B) Lanthanide complexes often have high coordination numbers (> 6). (C) All of the lanthanide elements react with aqueous acid to liberate hydrogen. (D) The atomic radii of the lanthanide elements increase across the period from La to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide elements in fact decrease across the period from La to Lu. Options (A), (B), and (C) are all true. This means that only (D) is NOT true. The answer is (D).\n\nQ: Which of the following lists the hydrides of group-14 elements in order of thermal stability, from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4 < SnH4 < CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4 < GeH4 < SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14 hydrides decreases as we move from the top of group 14 to the bottom. The order of elements in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in order of increasing thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4, or answer (A). The answer is (A).\n\nQ: Predict the number of lines in the EPR spectrum of a solution of 13C-labelled methyl radical (13CH3\u2022), assuming the lines do not overlap.\n(A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step by step. The electron paramagnetic resonance spectrum will be split by two forms of interactions. The first is the hyperfine interaction with the 13C (nuclear spin $I = \nrac{1}{2}$) which will split the spectrum into 2 lines. This will be further split into 4 lines by the interaction with three equivalent 1H nuclei. The total number of lines is therefore $2 \\cdot 4 = 8$. The answer is (E).", "college_computer_science": "The following are multiple choice questions (with answers) about college computer science.\n\nQ: Which of the following regular expressions is equivalent to (describes the same set of strings as) (a* + b)*(c + d)?\n(A) a*(c + d)+ b(c + d)\n(B) a*(c + d)* + b(c + d)*\n(C) a*(c + d)+ b*(c + d)\n(D) (a + b)*c +(a + b)*d\nA: Let's think step by step. We know that:\n1. (X* + Y)* = (X + Y)*\n2. X(Y + Z)? = XY + XZ\nUsing equation 1 we can rewrite (a* + b)*(c + d)? as:\n3. (a + b)*(c + d)?\nUsing equation 2 we can rewrite equation 3 as:\n(a + b)*c + (a + b)*d The answer is (D).\n\nQ: The Singleton design pattern is used to guarantee that only a single instance of a class may be instantiated. Which of the following is (are) true of this design pattern?\nI. The Singleton class has a static factory method to provide its instance.\nII. The Singleton class can be a subclass of another class.\nIII. The Singleton class has a private constructor.\n(A) I only\n(B) II only\n(C) III only\n(D) I, II, and III\nA: Let's think step by step. Statement I is a correct statement about a Singleton, because a Singleton restricts instantiation to a single, static method. Statement II is also correct, because there is no inherent restriction regarding the inheritance of a Singleton. Statement III is also correct, because a Singletons must be instantiated only once, so its constructor is made private to prevent any construction except via its static factory method.\nGiven these facts, statements I, II, and III are all correct. 
The answer is (D).\n\nQ: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, . . . , R7 and supports the following operations:\nADD Rs1, Rs2, Rd (Add Rs1 to Rs2 and put the sum in Rd)\nMUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the product in Rd)\nAn operation normally takes one cycle; however, an operation takes two cycles if it produces a result required by the immediately following operation in an operation sequence.\nConsider the expression AB + ABC + BC, where variables A, B, C are located in registers R0, R1, R2. If the contents of these three registers must not be modified, what is the minimum number of clock cycles required for an operation sequence that computes the value of AB + ABC + BC?\n(A) 5 (B) 6 (C) 7 (D) 8\nA: Let's think step by step. First, we are given that A is in R0, B is in R1, and C is in R2.\nNext, we can see that we must compute three multiplies (AB, BC, and ABC) and two adds (AB + ABC, (AB + ABC) + BC) to compute our final answer, resulting in a minimum of five clock cycles.\nNext, we can see that there is no way to avoid at least one pipeline stall when computing our final answer, because to compute our final sum we must wait at least one cycle for the results from the previous stage to be ready. Thus, our minimum number of cycles must be 6.\nWe can verify that we can create a solution that requires only six cycles as follows:\ncompute AB: MUL R0, R1, R3\ncompute BC: MUL R1, R2, R4\ncompute ABC: MUL R3, R4, R5\ncompute AB + BC: ADD R3, R4, R6\nSTALL\ncompute AB + ABC + BC: ADD R5, R6, R7\nSo there are 6 cycles. The answer is (B).\n\nQ: A compiler generates code for the following assignment statement.\nG := (A + B) * C - (D + E) * F\nThe target machine has a single accumulator and a single-address instruction set consisting of instructions load, store, add, subtract, and multiply. For the arithmetic operations, the left operand is taken from the accumulator and the result appears in the accumulator. The smallest possible number of instructions in the resulting code is\n(A) 5 (B) 6 (C) 7 (D) 9\nA: Let's think step by step. We can compute the final answer with the following sequence of operations:\n1. LOAD D (accumulator = D)\n2. ADD E (accumulator = D+E)\n3. MUL F (accumulator = (D+E)*F)\n4. STORE X (X = (D+E)*F)\n5. LOAD A (accumulator = A)\n6. ADD B (accumulator = A+B)\n7. MUL C (accumulator = (A+B)*C)\n8. SUB X (accumulator = (A+B)*C - (D+E)*F)\n9. STORE G (G = (A+B)*C - (D+E)*F)\nThis sequence takes 9 instructions. The answer is (D).\n\nQ: Consider a computer design in which multiple processors, each with a private cache memory, share global memory using a single bus. This bus is the critical system resource. Each processor can execute one instruction every 500 nanoseconds as long as memory references are satisfied by its local cache. When a cache miss occurs, the processor is delayed for an additional 2,000 nanoseconds. During half of this additional delay, the bus is dedicated to serving the cache miss. During the other half, the processor cannot continue, but the bus is free to service requests from other processors. On average, each instruction requires 2 memory references. On average, cache misses occur on 1 percent of references. What proportion of the capacity of the bus would a single processor consume, ignoring delays due to competition from other processors?\n(A) 1/50 (B) 1/27 (C) 1/25 (D) 2/27\nA: Let's think step by step. 
We know that each instruction requires two memory references per instruction, and that there is an average cache miss rate of one percent.\nThus a given processor has:\n(1 cache miss / 100 references) * (2 references / instruction) =\n(2 cache misses / 100 instructions), so:\nmisses_per_instruction = 1 cache miss / 50 instructions.\nNext, we know that each instruction requires 500 nanoseconds when there is no cache miss, and 500 + 2000 = 2500 nanoseconds when there is a cache miss. Thus:\n50 instructions / (49 * 500) + (1 * 2500) nanoseconds, so:\ninstructions_per_ns = 50 instructions / 27000 nanoseconds.\nNow, we know that each cache miss locks the bus for half of the 2000 nanosecond cache miss delay, or 1000 nanoseconds, so:\nlock_ns_per_miss = 1000 nanoseconds / cache miss.\nThus we can see that on average a single processor will lock the bus for:\nlock_ns_per_miss * misses_per_instruction * instructions_per_ns =\n(1000 nanoseconds / cache miss) * (1 cache miss / 50 instructions) * (50 instructions / 27000 nanoseconds) = 1000 * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer is (B).", "college_mathematics": "The following are multiple choice questions (with answers) about college mathematics.\n\nQ: Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\n(A) ST = 0 (B) ST = T (C) ST = TS (D) ST - TS is the identity map of V onto itself.\nA: Let's think step by step. For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))\u2019 = p(x) + xp\u2019(x)\\]\nand\n\\[TS(p) = xp\u2019(x).\\]\nHence \\[ST(p) - TS(p) = p(x) + xp\u2019(x) - xp\u2019(x).\\] The answer is (D).\n\nQ: Suppose that f(1 + x) = f(x) for all real x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11 (B) 0 (C) 11 (D) 33/2\nA: Let's think step by step. The only polynomial so that $f(1 + x) = f(x)$ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. The answer is (C).\n\nQ: Let A be a real 2x2 matrix. Which of the following statements must be true?\nI. All of the entries of A^2 are nonnegative.\nII. The determinant of A^2 is nonnegative.\nIII. If A has two distinct eigenvalues, then A^2 has two distinct eigenvalues.\n(A) I only (B) II only (C) III only (D) II and III only\nA: Let's think step by step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence II holds.\nIII is false: as a counterexample take a diagonal matrix with -1 and 1 on the diagonal. Then $A^2$ is the identity matrix. The answer is (B).\n\nQ: Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) \\in A}?\n(A) -5 (B) -4 (C) -3 (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m and one of the solutions is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need to look for smaller $m$ in order to make $m + n$ negative. The next solution is $m = -14$ and $n = 10$. For smaller $m$ we have $m + n$ smaller than $-4$. The answer is (B).\n\nQ: A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. 
If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\n(A) 2 (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step by step. For all $t \\in \\mathbb{R}$, let $s(t)$ denote the number grams of salt in the tank at the $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably. We also use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. The solution sprayed into the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always 100 liters of liquid in the tank, containing $s$ grams of salt. So the density of salt in the tank is $s / 100$ grams per liter. The flow of water out of the tank therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for all $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$, and so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\right]$. For all $t \\in \\mathbb{R}$,\n$$\n\frac{d}{d t}[\\ln (s-2)]=\frac{s^{\\prime}}{s-2}=\frac{-1}{25}=\frac{d}{d t}\\left[-\frac{t}{25}\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that, for all $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then, for all $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K e^{-t / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100 / 25}=2+1 \\cdot e^{-4}=2+e^{-4}$. The answer is (D).", "college_medicine": "The following are multiple choice questions (with answers) about college medicine.\n\nQ: An expected side effect of creatine supplementation is:\n(A) muscle weakness. (B) gain in body mass. (C) muscle cramps. (D) loss of electrolytes.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Creatine supplementation is a dietary supplement that results in body mass gain. The answer is (B).\n\nQ: Which of the following is not a true statement?\n(A) Muscle glycogen is broken down enzymatically to glucose-1-phosphate (B) Elite endurance runners have a high proportion of Type I fibres in their leg muscles (C) Liver glycogen is important in the maintenance of the blood glucose concentration (D) Insulin promotes glucose uptake by all tissues in the body\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s solve this step by step and go over each choice: \n(A) \u201cMuscle glycogen is broken down enzymatically to glucose-1-phosphate\u201d: This is a correct statement.\n(B) \u201cElite endurance runners have a high proportion of Type I fibres in their leg muscles\u201d: This is a correct statement.\n(C) \u201cLiver glycogen is important in the maintenance of the blood glucose concentration\u201d: This is a correct statement. \n(D) \u201cInsulin promotes glucose uptake by all tissues in the body\u201d: This is not a correct statement, because insulin promotes glucose uptake by the liver, adipose tissue, and muscle, but not all tissues. For instance, the tissues in the brain and red blood cells are not affected by insulin. The answer is (D).\n\nQ: A high school science teacher fills a 1 liter bottle with pure nitrogen and seals the lid. The pressure is 1.70 atm, and the room temperature is 25\u00b0C. Which two variables will both increase the pressure of the system, if all other variables are held constant?\n(A) Increasing temperature, increasing moles of gas (B) Increasing temperature, increasing volume (C) Decreasing volume, decreasing temperature (D) Decreasing moles of gas, increasing volume\nA: Let's think step by step. 
We refer to Wikipedia articles on medicine for help. The relevant equation for this is the ideal gas law: PV=nRT. To increase the pressure of the system (P), then either n (number of moles of the gas) or T (temperature) have to increase. The answer is (A).\n\nQ: In a genetic test of a newborn, a rare genetic disorder is found that has X-linked recessive transmission. Which of the following statements is likely true regarding the pedigree of this disorder?\n(A) All descendants on the maternal side will have the disorder. (B) Females will be approximately twice as affected as males in this family. (C) All daughters of an affected male will be affected. (D) There will be equal distribution of males and females affected.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s solve this step by step. Let's recall first that females have two X chromosomes, while males have one X and one Y chromosome. This is an important fact we need to know before answering this question. \nBecause a male can only pass his only one X chromosome to a daughter, if he is affected by this rare genetic disorder, then we know for sure that he will pass this rare genetic disorder to all his future-born daughters. Therefore, \u201c(C): All daughters of an affected male will be affected\u201d is a correct statement. The answer is (C).\n\nQ: Glucose is transported into the muscle cell:\n(A) via protein transporters called GLUT4. (B) only in the presence of insulin. (C) via hexokinase. (D) via monocarbylic acid transporters.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Glucose (also known as the blood sugar) is the main sugar found in the human body. It is transported into the muscle cell via diffusion through protein transporters called GLUT4. The answer is (A).", "college_physics": "The following are multiple choice questions (with answers) about college physics.\n\nQ: A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4. The answer is (A).\n\nQ: The muon decays with a characteristic lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from decaying into an electron and just a single neutrino by the law of conservation of\n(A) charge (B) mass (C) energy and momentum (D) lepton number\nA: Let's think step by step. Lepton number must be conserved, meaning the total number of leptons minus the number of antileptons. If a muon decays into an electron and a single neutrino, the total lepton number would go from one to two, violating lepton number conservation. The answer is (D).\n\nQ: One end of a Nichrome wire of length 2L and cross-sectional area A is attached to an end of another Nichrome wire of length L and cross- sectional area 2A. 
If the free end of the longer wire is at an electric potential of 8.0 volts, and the free end of the shorter wire is at an electric potential of 1.0 volt, the potential at the junction of the two wires is most nearly equal to\n(A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V\nA: Let's think step by step. This is a simple voltage divider problem, where the longer wire has a resistance four times that of the shorter end. So the voltage divider ratio is 1 / 5, meaning that the potential in the middle is 1.0 V + (8.0 V - 1.0 V) * 1/5 = 2.4 V. The answer is (A).\n\nQ: A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4. The answer is (A).\n\nQ: For which of the following thermodynamic processes is the increase in the internal energy of an ideal gas equal to the heat added to the gas?\n(A) Constant temperature (B) Constant volume (C) Constant pressure (D) Adiabatic\nA: Let's think step by step. Heat added to the gas can go into the gases internal energy or work done against an external force. However, if the volume of the gas container is constant, no work will be done (since work is pressure times change in volume). So, at constant volume, all of the heat goes into the internal energy. The answer is (B).", "computer_security": "The following are multiple choice questions (with answers) about computer security.\n\nQ: SHA-1 has a message digest of\n(A) 160 bits (B) 512 bits (C) 628 bits (D) 820 bits\nA: Let's think step by step. Since SHA-1 is a hash function which takes an input and produces a 160-bit (20-byte) hash value, its message digest is 160 bits. The answer is (A).\n\nQ: _____________ can modify data on your system \u2013 so that your system doesn\u2019t run correctly or you can no longer access specific data, or it may even ask for ransom in order to give your access.\n(A) IM \u2013 Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D) Ransom Trojan\nA: Let's think step by step. The system is asking for trojans, which are for ransom, which means ransom trojan. The answer is (D).\n\nQ: What is ethical hacking?\n(A) \"Hacking\" ethics so they justify unintended selfish behavior (B) Hacking systems (e.g., during penetration testing) to expose vulnerabilities so they can be fixed, rather than exploited (C) Hacking into systems run by those whose ethics you disagree with (D) A slang term for rapid software development, e.g., as part of hackathons\nA: Let's think step by step. Ethical hacking is a process of detecting vulnerabilities in an application, system, or organization's infrastructure that an attacker can use to exploit an individual or organization. They use this process to prevent cyberattacks and security breaches by lawfully hacking into the systems and looking for weak points. The answer is (B).\n\nQ: The ____________ is anything which your search engine cannot search.\n(A) Haunted web (B) World Wide Web (C) Surface web (D) Deep Web\nA: Let's think step by step. 
The search engine searches on the Surface Web, which is the portion of the world wide web which is visible so (B,C) are wrong. The Haunted Web doesn\u2019t correspond to an internet concept. The Deep Web is the part of the World Wide Web which is not indexed. The answer is (D).\n\nQ: Exploitation of the Heartbleed bug permits\n(A) overwriting cryptographic keys in memory (B) a kind of code injection (C) a read outside bounds of a buffer (D) a format string attack\nA: Let's think step by step. The Heartbleed Bug is a serious vulnerability in the popular OpenSSL cryptographic software library. Heartbleed resulted from improper input validation (due to a missing bounds check) in the implementation of the TLS heartbeat extension. The vulnerability was classified as a buffer over-read, a situation where more data can be read than should be allowed. The answer is (C).", "conceptual_physics": "\nThe following are multiple choice questions (with answers) about conceptual physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted to a different frequency (B) deflection (C) interference (D) polarization\nA: Let's think step by step. In a soap bubble film, the light bounces between the two soap-air interfaces many times, interfering with itself constructively or destructively depending on the width of the film. This results in different colors being visible. The answer is (C).\n\nQ: Compared with the mass of a uranium atom undergoing fission, the combined masses of the products after fission are\n(A) less (B) more (C) the same (D) zero\nA: Let's think step by step. Fission releases energy, which comes from the rest mass of its initial nucleus. Thus the mass of the products is less than the mass of the reactant uranium nucleus. The answer is (A).\n\nQ: Things that are equivalent according to the equivalence principle are\n(A) space and time. (B) a traveling twin and a stay-at-home twin. (C) gravity and acceleration. (D) mass and energy.\nA: Let's think step by step. Einstein\u2019s famous equivalence principle states that gravity and acceleration are equivalent. The answer is (C).\n\nQ: Which of these three elements has the most mass per nucleon?\n(A) Hydrogen (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step. Due to nuclear binding energy, the mass of an atomic nucleus is less than the sum of individual masses of the free constituent protons and neutrons; this is known as the mass defect. Hydrogen has no mass defect because it has only a single nucleon, so it will have the most mass per nucleon. The answer is (A).\n\nQ: A model airplane flies slower when flying into the wind and faster with wind at its back. When launched at right angles to the wind a cross wind its groundspeed compared with flying in still air is\n(A) the same (B) greater (C) less (D) either greater or less depending on wind speed\nA: Let's think step by step. The plane\u2019s speed in the direction of the wind is greater than it would be in the absence of wind, and its direction orthogonal to the wind is the same as it would be in the absence of the wind. The total speed, which is these two components added in quadrature, is thus greater than the speed in still air. The answer is (B).", "econometrics": "The following are multiple choice questions (with answers) about econometrics.\n\nQ: Suppose now that a researcher wishes to use information criteria to determine the optimal lag length for a VAR. 
500 observations are available for the bi-variate VAR, and the values of the determinant of the variance-covariance matrix of residuals are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively. What is the optimal model order according to Akaike's information criterion?\n(A) 1 lag (B) 2 lags (C) 3 lags (D) 4 lags\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s solve this problem step by step. First of all, let\u2019s recall that for a given set of data, Akaike's information criterion (AIC) allows us to measure how well a statistical model fits the data; it is an estimator of prediction error. Here in this problem we will need to use the formula ln(det(sigma_hat)) + (2 * k / T) to determine the values of Akaike\u2019s criterion, where ln denotes the natural log function, det the determinant function, k the total number of parameters in total (across both equations), and T the number of observations (which, in this case, is equal to 500). For 1 lag, the number of parameters in total is equal to 6; for 2 lags, it is 10; for 3 lags, it is 14; and for 4 lags, it is 18. Now, let\u2019s calculate the values of the criterion for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500) = ln(0.0336) + (12 / 500) = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) = ln(0.0169) + (20 / 500) = -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084) + (28 / 500) =-4.724\n(D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36 / 500) =-5.011\nBecause the optimal model order according to AIC minimizes the information criterion, the answer should be the one with the lowest value. In this case, (D) has the lowest value. The answer is (C).\n\nQ: Consider the following AR(1) model with the disturbances having zero mean and unit variance\nyt = 0.2 + 0.4 yt-1 + ut\nThe (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4 (C) 0.5 (D) 0.33\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s solve this problem step by step. If we have a an AR(1) model with the disturbances having zero mean and unit variance, then the unconditional mean of y is equal to the following:\nunconditional mean of y = (the intercept term) / (1 - autoregressive coefficient)\nWe know that the intercept term is 0.2 and the autoregressive coefficient is 0.4; thus, we have:\nunconditional mean of y = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which is approximately 0.33. That means that the answer should be (D) 0.33. The answer is (D).\n\nQ: What would be then consequences for the OLS estimator if heteroscedasticity is present in a regression model but ignored?\n(A) It will be biased (B) It will be inconsistent (C) It will be inefficient (D) All of (a), (b) and (c) will be true.\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Heteroscedasticity refers to the condition where the variance of the error terms is not constant across multiple observations. If heteroscedasticity is present in a regression model, then the coefficient estimates in the OLS estimator will be not only unbiased and consistent but also inefficient. Because (A) and (B) are incorrect choices and (C) is a correct choice, (D) cannot be the right answer. Ultimately, (C) is the only true choice. The answer is (C).\n\nQ: Suppose that a test statistic has associated with it a p-value of 0.08. 
Which one of the following statements is true?\n(i) If the size of the test were exactly 8%, we would be indifferent between rejecting and not rejecting the null hypothesis\n(ii) The null would be rejected if a 10% size of test were used\n(iii) The null would not be rejected if a 1% size of test were used\n(iv) The null would be rejected if a 5% size of test were used.\n(A) (ii) and (iv) only (B) (i) and (iii) only (C) (i), (ii), and (iii) only (D) (i), (ii), (iii), and (iv).\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s reason about each of the options.\n(i) is a true statement.\n(ii) is a true statement.\n(iii) is a true statement.\n(iv) is not a true statement. Thus, (i), (ii), and (iii) are true. The answer is (C).\n\nQ: For a stationary autoregressive process, shocks will\n(A) Eventually die away (B) Persist indefinitely (C) Grow exponentially (D) Never occur\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. This is a formal logic problem about stationally process. For a stationary autoregressive process, shocks will eventually die away. The answer is (A).", "electrical_engineering": "\nThe following are multiple choice questions (with answers) about electrical engineering.\n\nQ: A point pole has a strength of 4\u03c0 * 10^-4 weber. The force in newtons on a point pole of 4\u03c0 * 1.5 * 10^-4 weber placed at a distance of 10 cm from it will be\n(A) 15 N. (B) 20 N. (C) 7.5 N. (D) 3.75 N.\nA: Let's think step by step. The force between two point poles is given by m_1m_2/(mu_0 4 \\pi r^2), in analogy to Coulomb\u2019s law. Plugging in the values given in the question, we calculate that the force is approximately 15 N. The answer is (A).\n\nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30 mm wide. The control torque is 240*10-6 N-m on full scale. If magnetic flux density is 1Wb/m2 range of meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's think step by step. The torque on a coil in a uniform magnetic field is given by BANI, where B is the magnetic flux density, A is the area of the coil, N is the number of turns, and I is the current. So we have that I = (Torque)/(BAN), or 240e-6/(1200e-6 * 100 * 1) = 2e-3. The answer is (B).\n\nQ: In an SR latch built from NOR gates, which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1 (C) S=1, R=0 (D) S=1, R=1\nA: Let's think step by step. An SR latch is a set-reset latch; in the case where S=1 and R=1, the circuit has no stable state; instead a race condition will be produced within the circuit, so the device will be in an undefined state. So S=1, R=1 is an illegal input. The answer is (D).\n\nQ: Two long parallel conductors carry 100 A. If the conductors are separated by 20 mm, the force per meter of length of each conductor will be\n(A) 100 N. (B) 0.1 N. (C) 1 N. (D) 0.01 N.\nA: Let's think step by step. The magnetic force-per-length between two current-carrying conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r), where $r$ is the separation distance and I_1 and I_2 are the currents. Plugging in 100 A for I_1 and I_2, and 20 mm for r, gives 0.1 N. The answer is (B).\n\nQ: In a 2 pole lap winding dc machine , the resistance of one conductor is 2\u03a9 and total number of conductors is 100. Find the total resistance\n(A) 200\u03a9 (B) 100\u03a9 (C) 50\u03a9 (D) 10\u03a9\nA: Let's think step by step. In lap winding, effectively two resistors are connected in parallel, so the actual resistance of each pair is 1 Ohm. 
Since we have 50 pairs, we get a total resistance of 50 Ohms. The answer is (C).", "elementary_mathematics": "The following are multiple choice questions (with answers) about elementary mathematics.\n\nQ: Olivia used the rule \"Add 11\" to create the number pattern shown below. 10, 21, 32, 43, 54. Which statement about the number pattern is true?\n(A) The 10th number in the pattern will be an even number.\n(B) The number pattern will never have two even numbers next to each other.\n(C) The next two numbers in the pattern will be an even number then an odd number.\n(D) If the number pattern started with an odd number then the pattern would have only odd numbers in it.\nA: Let's think step by step. Choice A is incorrect because every even-numbered term in the pattern is odd, and 10 is an even number. Choice B is correct, because adding an odd number (in this case 11) to an odd number produces an even number, and adding an odd number to an even number produces an odd number. Thus the terms in the pattern will alternate between odd and even, so there will never be two even numbers next to each other. Choice C is incorrect because the last term in the example is even (54), and we know that the terms will alternate between even and odd. Choice D is incorrect because the terms in the pattern will alternate between odd and even, regardless of the value of the first term. The answer is (B).\n\nQ: The population of the city where Michelle was born is 145,826. What is the value of the 5 in the number 145,826?\n(A) 5 thousands\n(B) 5 hundreds\n(C) 5 tens\n(D) 5 ones\nA: Let's think step by step. Choice A is correct, because there are three digits following the 5, so\nthe 5 is in the thousands place. Thus the other choices are incorrect. The answer is (A).\n\nQ: A store sells 107 different colors of paint. They have 25 cans of each color in storage. The number of cans of paint the store has in storage can be found using the expression below. 107 \u00d7 25. How many cans of paint does the store have in storage?\n(A) 749\n(B) 2,675\n(C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107 x 25 = (100 x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\n(A) Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n(C) Multiply 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\nA: Let's think step by step. We want to find the number of teams. We know that there are 5 players/team, and 30 players. Thus to get the number of teams we divide players by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B).\n\nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n(B) (5 x 5) + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think step by step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5 x 4). The answer is (B).", "formal_logic": "The following are multiple choice questions (with answers) about formal logic.\n\nQ: Which of the given formulas of PL is the best symbolization of the following sentence?\nTurtles live long lives and are happy creatures, unless they are injured.\n(A) (L \u2022 H) \u2261 I (B) (L \u2022 H) \u2228 I (C) L \u2022 (H \u2228 I) (D) L \u2022 (H \u2283 R).\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. 
Let \u201cL\u201d denote \u201cliving long\u201d, \u201cH\u201d \u201cbeing happy\u201d, and \u201cI\u201d \u201cbeing injured\u201d. Now, consider each choice:\n(A) means (living long AND being happy) is equivalent to (being injured). \n(B) means (living long AND being happy) OR (being injured). \n(C) means (living long) AND (being happy OR being injured). \n(D) means (living long) AND (being happy implies being R), but what R denotes is not clear.\nObviously, (B) is the best symbolization of the original sentence. The answer is (B).\n\nQ: Select the best translation into predicate logic.George borrows Hector's lawnmower. (g: George; h: Hector; l: Hector's lawnmower; Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh (D) Bghl\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. We are told that \u201cBxyx\u201d means \u201cx borrows y from z\u201d. We can rewrite \u201cGeorge borrows Hector's lawnmower\u201d as \u201cGeorge borrows a lawnmower from Hector\u201d, which can then be translated into predicate logic as \u201cBglh\u201d. The answer \u201cBglh\u201d appears in (C); therefore, (C) must be the correct answer. The answer is (C).\n\nQ: \nSelect the best English interpretation of the given arguments in predicate logic.\nDm\n(\u2200x)(Wx \u2283 ~Dx). \n(\u2200x)Wx \u2228 Ag\t/ (\u2203x)Ax\n(A) Marina is a dancer. Some weaklings are not dancers. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball. (B) Marina is a dancer. No weakling is a dancer. Everything is either a weakling or plays volleyball. So something plays volleyball. (C) Marina is a dancer. Some weaklings are not dancers. Everything is either a weakling or plays volleyball. So something plays volleyball. (D) Marina is a dancer. No weakling is a dancer. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball.\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cD\u201d denote \u201cbeing a dancer\u201d, \u201cm\u201d denote \u201cMarina\u201d, \u201cg\u201d denote \u201cGeorgia\u201d, \u201cW\u201d denote \u201cweakling\u201d, \u201cA\u201d denote \u201cplaying volleyball\u201d. Then, we have the following:\n1. Dm \u2192 Marina is a dancer.\n2. (\u2200x)(Wx \u2283 ~Dx). \u2192 For all x, if x is a weakling, then x is not a dancer. In other words, no weakling is a dancer.\n3. (\u2200x)Wx \u2228 Ag\t/ (\u2203x)Ax \u2192 Either everything is a weakling or Georgia plays volleyball. So there exists an x that plays volleyball. \nOptions (A) and (C) do claim that some weaklings are not dancers, but the second argument clearly states that no weakling is a dancer. Thus, we can eliminate them. Option (B) omits the important detail about Georgia playing volleyball. Option (D) has all the details presented in the arguments and is the best English interpretation of the arguments. The answer is (D).\n\nQ: Select the best translation into predicate logic: No people drive on Mars.\n(A) ~Pd (B) (\u2200x)(Px \u2228 ~Dx) (C) (\u2200x)(Px \u2283 ~Dx) (D) ~Dp\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cP\u201d denote \u201cbeing a person\u201d and \u201cD\u201d denote \u201cdriving on Mars\u201d. 
Then let\u2019s consider each option:\nOption (A): ~Pd \u2192 d is not a person.\nOption (B): (\u2200x)(Px \u2228 ~Dx) \u2192 For all x, x is a person or x does not drive on Mars.\nOption (C): (\u2200x)(Px \u2283 ~Dx) \u2192 For all x, if x is a person, then x does not drive on Mars.\nOption (D): ~Dp \u2192 p does not drive on Mars.\nOf all these options, Option (C) is the best and most faithful symbolization of the sentence \u201cNo people drive on Mars.\u201d The answer is (C).", "global_facts": "The following are multiple choice questions (with answers) about global facts.\n\nQ: As of 2017, how many of the world\u2019s 1-year-old children today have been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. According to data published by the World Health Organization, the share of 1-year-old children vaccinated in 2017 exceeds 80%. The answer is (A).\n\nQ: As of 2019, about what percentage of Americans agree that the state is run for the benefit of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. In 2019, about 46% of Americans agreed that the state is run for the benefit of all the people. The answer is (B).\n\nQ: As of 2019, about what percentage of Russians say it is very important to have free media in our country without government/state censorship?\n(A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2019, about 38% of Russians say it is very important to have free media in our country. The answer is (A).\n\nQ: As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.\n(A) increased, increased (B) increased, decreased (C) decreased, increased (D) decreased, decreased\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2015, since 1990 forests have increased in Europe and have decreased in Africa and the Americas. The answer is (B).\n\nQ: Which of the following pairs of statements are both true (as of 2019)?\n(A) People tend to be optimistic about their own future and the future of their nation or the world. (B) People tend to be optimistic about their own future but pessimistic about the future of their nation or the world. (C) People tend to be pessimistic about their own future but optimistic about the future of their nation or the world. (D) People tend to be pessimistic about their own future and the future of their nation or the world.\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2019, most people tend to be optimistic about their own future but pessimistic about the future of their nation or the world. The answer is (B).", "high_school_biology": "The following are multiple choice questions (with answers) about high school biology.\n\nQ: In animal cells, which of the following represents the most likely pathway that a secretory protein takes as it is synthesized in a cell?\n(A) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013rough ER (B) Ribosome\u2013Golgi apparatus\u2013rough ER\u2013secretory vesicle\u2013plasma membrane (C) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013rough ER (D) Ribosome\u2013rough ER\u2013Golgi apparatus\u2013secretory vesicle\u2013plasma membrane\nA: Let's think step by step. 
Protein synthesis starts at the ribosome, so we can eliminate (A) and (C). Secretory proteins are synthesized on ribosomes bound to the rough ER and move from there to the Golgi apparatus, where they are modified and packaged into secretory vesicles. The vesicles then travel to the plasma membrane, where the proteins are secreted. The answer is (D).\n\nQ: A mutation in a bacterial enzyme changed a previously polar amino acid into a nonpolar amino acid. This amino acid was located at a site distant from the enzyme\u2019s active site. How might this mutation alter the enzyme\u2019s substrate specificity?\n(A) By changing the enzyme\u2019s pH optimum (B) By changing the enzyme\u2019s location in the cell (C) By changing the shape of the protein (D) An amino acid change away from the active site cannot alter the enzyme\u2019s substrate specificity.\nA: Let's think step by step. A change in an amino acid leads to a change in the primary structure of the protein. A change in the primary structure may lead to a change in the secondary and the tertiary structure of the protein. A change in the tertiary structure means a change in the shape of the protein, so (C) has to be correct. Since the change does not affect the active site of the enzyme, we do not expect the activity of the enzyme to be affected. The answer is (C).\n\nQ: Which of the following is not a way to form recombinant DNA?\n(A) Translation (B) Conjugation (C) Specialized transduction (D) Transformation\nA: Let's think step by step. The introduction of foreign DNA or RNA into bacteria or eukaryotic cells is a common technique in molecular biology and scientific research. There are multiple ways foreign DNA can be introduced into cells including transformation, transduction, conjugation, and transfection. In contrast, (A) is not a way to form recombinant DNA: during translation the ribosomes synthesize proteins from RNA. The answer is (A).\n\nQ: Homologous structures are often cited as evidence for the process of natural selection. All of the following are examples of homologous structures EXCEPT\n(A) the wings of a bird and the wings of a bat (B) the flippers of a whale and the arms of a man (C) the pectoral fins of a porpoise and the flippers of a seal (D) the forelegs of an insect and the forelimbs of a dog\nA: Let's think step by step. Homologous structures are similar physical features in organisms that share a common ancestor but may serve different functions. Comparisons (B) and (C) are clearly homologous because they share a common ancestor and the structures serve different purposes. Bat wings and bird wings are also homologous; while they are both wings, these forelimbs serve different purposes. Insects and dogs are only very distantly related, since one is an invertebrate while the other is a vertebrate, and their forelegs serve the same purpose, so these structures are not homologous. The answer is (D).\n\nQ: Which of the following is not known to be involved in the control of cell division?\n(A) Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast cells\nA: Let's think step by step. Normal cells move through the cell cycle in a regulated way. At the checkpoint stage, they use information about their own internal state and cues from the environment around them to decide whether to proceed with cell division. Cues like these act by changing the activity of core cell cycle regulators inside the cell. The most common regulators are cyclins and cyclin-dependent kinases. Fibroblast cells do not play any role in cell division. 
The answer is (D).", "high_school_chemistry": "The following are multiple choice questions (with answers) about high school chemistry.\n\nQ: Which of the following is considered an acid anhydride?\n(A) HCl (B) H2SO3 (C) SO2 (D) Al(NO3)3\nA: Let's think step by step. An acid anhydride is a compound that is derived by removing water from an acid. The chemical formula for water is H2O, which means that we need to determine which of these options, when combined with H2O, forms an acid. SO2, or Sulfur dioxide, when combined with H2O, makes H2SO4, or sulfuric acid. The answer is (C).\n\nQ: Which of the following is expected to be a polar molecule?\n(A) PCl4F (B) BF3 (C) CO2 (D) Si(CH3)4\nA: Let's think step by step. A polar molecule is one that has a slightly positive charge on one end of the molecule and a slightly negative charge on the other end. Boron trifluoride (BF3) has Boron as the center atom and three fluorine atoms attached to it; it is trigonal planar and symmetric, so it is nonpolar. Carbon Dioxide (CO2) has Carbon as the central atom with double bonds to two Oxygen atoms - this is also symmetrical and therefore nonpolar. The same is the case for tetramethyl silane (SI(CH3)4), which is a Silicon atom surrounded by four methyl groups. The structure of PCL4F is that Phosphorus is the central atom, attached to four chlorines and one fluorine atom. This is asymmetrical, and therefore has a net dipole and is expected to be a polar molecule. The answer is (A).\n\nQ: From the solubility rules, which of the following is true?\n(A) All chlorides, bromides, and iodides are soluble (B) All sulfates are soluble (C) All hydroxides are soluble (D) All ammonium-containing compounds are soluble\nA: Let's think step by step. The chlorides, bromides, and iodides of lead, silver, and mercury are not soluble in water. This rules out (A). The sulfates of lead, barium, and calcium are not soluble in water, which rules out (B). The hydroxides of any metal besides sodium, potassium, ammonium, calcium, and barium are insoluble. This rules out (C). Typically ammonium ions indicate a soluble ionic substance. The answer is (D).\n\nQ: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?\n(A) 3.89 (B) 7.78 (C) 5.78 (D) 2.33\nA: Let's think step by step. Recall that $[A] = [H^{+}]$. Here, this is equal to $$10^{-3.89}$. Then we have $K_{a} = $\nrac{[H^{+}][A^{-}]}{[HA]} = \nrac{10^{-3.89} \\cdot 10^{-3.89}}{10^{-2}}. The resulting exponent is $-3.89 + (-3.89) - (-2) = 5.78$, therefore $K_a = 10^{-5.78}$. The $pK_a$ is the negative log of $K_a$, which is equal to $5.78$. The answer is (C).\n\nQ: A solution contains 2.00 mole of acetic acid, CH3COOH, and 1.00 mole of calcium acetate, Ca(CH3COO)2. The solution is able to resist the addition of a small amount of strong acid or strong base with only minor changes in the pH of the solution. Larger quantities of strong acid or strong base can cause a significant change in pH. How many moles of nitric acid, HNO3, may be added before the pH begins to change significantly?\n(A) 0.500 mole (B) 1.00 mole (C) 2.00 mole (D) 3.00 mole\nA: Let's think step by step. We would like to compute the buffer capacity of this solution. First we write the equation for the ionization of the weak acid, in this case of acetic acid. $CH_{3}COOH (aq) + H_{2}O \nightarrow H_{3}O^{+} + CH3COO^{-}$. 
The conjugate base is therefore the acetate ion. The added strong acid, nitric acid, will react with the conjugate base. Therefore the maximum amount of acid that can be added will be equal to the amount of acetate ion, or 2 moles. The answer is (C).", "high_school_computer_science": "The following are multiple choice questions (with answers) about high school computer science.\n\nQ: Which of the following is an example of the use of a device on the Internet of Things (IoT)?\n(A) A car alerts a driver that it is about to hit an object. (B) A hiker uses a GPS watch to keep track of her position. (C) A refrigerator orders milk from an online delivery service when the milk in the refrigerator is almost gone. (D) A runner uses a watch with optical sensors to monitor his heart rate.\nA: Let's think step by step. The term Internet of Things (IoT) refers to common devices which are connected to the internet, enabling new functionality. Choice A is incorrect because it does not describe an internet-connected device. In choice B, the watch is only described as having GPS functionality but no internet connectivity. Choice C describes a common device (a refrigerator) which has internet connectivity enabling new functionality (online ordering). Choice D does not mention internet connectivity for the watch, only optical sensors. The answer is (C).\n\nQ: Many Web browsers allow users to open anonymous windows. During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?\n(A) The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator. (B) Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer. (C) A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session. (D) A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded.\nA: Let's think step by step. Choice A is incorrect as it only describes network traffic, which an anonymous browser does not change. Choice B is correct as it correctly describes how an anonymous browser will prevent saving data on the user\u2019s computer after the session is ended. Choice C is incorrect because an anonymous browser will not prevent logging in to email or social media accounts. Choice D is incorrect because an anonymous browser in itself performs no virus protection. The answer is (B).\n\nQ: In the program below, the initial value of X is 5 and the initial value of Y is 10.\nIF (X < 0){\n DISPLAY (\"Foxtrot\")\n} ELSE {\n IF (X > Y){\n DISPLAY (\"Hotel\")\n } ELSE {\n IF (Y > 0){\n DISPLAY (\"November\")\n } ELSE {\n DISPLAY (\"Yankee\")\n }\n }\n}\nWhat is displayed as a result of running the program?\n(A) Foxtrot (B) Hotel (C) November (D) Yankee\nA: Let's think step by step. Because X has the value 5, the first conditional IF (X < 0) is false, so we move to the first ELSE clause. Because X is 5 and Y is 10, the second conditional IF (X > Y) is false, so we move to the following ELSE clause. Since Y is 10, the conditional IF (Y > 0) is true, so the command DISPLAY (\"November\") is executed. 
The answer is (C).\n\nQ: What is the output of \"abc\"[::-1] in Python 3?\n(A) Error (B) abc (C) cba (D) c\nA: Let's think step by step. We know that the slicing operator [::-1] takes all of the elements in the string in reverse order, so we reverse the order of the string \"abc\", resulting in \"cba\". The answer is (C).\n\nQ: A list of numbers has n elements, indexed from 1 to n. The following algorithm is intended to display the number of elements in the list that have a value greater than 100. The algorithm uses the variables count and position. Steps 3 and 4 are missing.\n Step 1: Set count to 0 and position to 1.\n Step 2: If the value of the element at index position is greater than 100, increase the value of count by 1.\n Step 3: (missing step)\n Step 4: (missing step)\n Step 5: Display the value of count.\nWhich of the following could be used to replace steps 3 and 4 so that the algorithm works as intended?\n(A) Step 3: Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until the value of count is greater than 100.\n(B) Step 3: Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until the value of position is greater than n.\n(C) Step 3: Repeat step 2 until the value of count is greater than 100.\n Step 4: Increase the value of position by 1.\n(D) Step 3: Repeat step 2 until the value of position is greater than n.\n Step 4: Increase the value of count by 1.\nA: Let's think step by step. Choice A is incorrect, because its Step 4 has an incorrect termination condition, stopping when count is greater than 100. We need to stop after inspecting all elements in the list. Choice B is correct because it correctly increments both count and position, and correctly repeats these steps and terminates when all elements in the list have been inspected. Choice C is incorrect because it incorrectly increments the variable count until its value is greater than 100, regardless of the elements in the list. Choice D is incorrect because its step 3 does not increment the value of position, so it will repeat forever. 
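Choice (B) from the missing-steps exemplar above can be transcribed directly into runnable Python; a minimal sketch, with a hypothetical input list for illustration:

def count_greater_than_100(numbers):
    count, position = 0, 1
    while position <= len(numbers):      # Step 4 of (B): repeat until position > n
        if numbers[position - 1] > 100:  # Step 2 (the pseudocode list is 1-indexed)
            count += 1
        position += 1                    # Step 3: increase position by 1
    return count

print(count_greater_than_100([99, 150, 101, 42]))  # 2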
The answer is (B).", "high_school_european_history": "The following are multiple choice questions (with answers) about high school european history.\n\nQ: This question refers to the following information.\nAlbeit the king's Majesty justly and rightfully is and ought to be the supreme head of the Church of England, and so is recognized by the clergy of this realm in their convocations, yet nevertheless, for corroboration and confirmation thereof, and for increase of virtue in Christ's religion within this realm of England, and to repress and extirpate all errors, heresies, and other enormities and abuses heretofore used in the same, be it enacted, by authority of this present Parliament, that the king, our sovereign lord, his heirs and successors, kings of this realm, shall be taken, accepted, and reputed the only supreme head in earth of the Church of England, called Anglicans Ecclesia; and shall have and enjoy, annexed and united to the imperial crown of this realm, as well the title and style thereof, as all honors, dignities, preeminences, jurisdictions, privileges, authorities, immunities, profits, and commodities to the said dignity of the supreme head of the same Church belonging and appertaining; and that our said sovereign lord, his heirs and successors, kings of this realm, shall have full power and authority from time to time to visit, repress, redress, record, order, correct, restrain, and amend all such errors, heresies, abuses, offenses, contempts, and enormities, whatsoever they be, which by any manner of spiritual authority or jurisdiction ought or may lawfully be reformed, repressed, ordered, redressed, corrected, restrained, or amended, most to the pleasure of Almighty God, the increase of virtue in Christ's religion, and for the conservation of the peace, unity, and tranquility of this realm; any usage, foreign land, foreign authority, prescription, or any other thing or things to the contrary hereof notwithstanding.\nEnglish Parliament, Act of Supremacy, 1534\nFrom the passage, one may infer that the English Parliament wished to argue that the Act of Supremacy would\n(A) give the English king a new position of authority (B) give the position of head of the Church of England to Henry VIII alone and exclude his heirs (C) establish Calvinism as the one true theology in England (D) end various forms of corruption plaguing the Church in England\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The Act of Supremacy states that it grants authority to the king \"to repress and extirpate all errors, heresies, and other enormities and abuses\", referring to the corruption in the Church of England. The answer is (D).\n\nQ: This question refers to the following information.\nRead the following excerpt.\nThe revolutionary seed had penetrated into every country and spread more or less. It was greatly developed under the r\u00e9gime of the military despotism of Bonaparte. His conquests displaced a number of laws, institutions, and customs; broke through bonds sacred among all nations, strong enough to resist time itself; which is more than can be said of certain benefits conferred by these innovators.\nThe monarchs will fulfil the duties imposed upon them by Him who, by entrusting them with power, has charged them to watch over the maintenance of justice, and the rights of all, to avoid the paths of error, and tread firmly in the way of truth. 
Placed beyond the passions which agitate society, it is in days of trial chiefly that they are called upon to despoil realities of their false appearances, and to show themselves as they are, fathers invested with the authority belonging by right to the heads of families, to prove that, in days of mourning, they know how to be just, wise, and therefore strong, and that they will not abandon the people whom they ought to govern to be the sport of factions, to error and its consequences, which must involve the loss of society.\nUnion between the monarchs is the basis of the policy which must now be followed to save society from total ruin. . . .\nLet them not confound concessions made to parties with the good they ought to do for their people, in modifying, according to their recognized needs, such branches of the administration as require it.\nLet them be just, but strong; beneficent, but strict.\nLet them maintain religious principles in all their purity, and not allow the faith to be attacked and morality interpreted according to the social contract or the visions of foolish sectarians.\nLet them suppress Secret Societies; that gangrene of society.\n\u2014Klemens von Metternich, Political Confession of Faith, 1820\nWhich of the following was the greatest cause of the fears expressed by Metternich in the document above?\n(A) The ideas of personal liberty and nationalism conceived during the Enlightenment resulted in radical revolutions that could spread throughout Europe. (B) The conquest of Europe by Napoleon led to the creation of new factions and shifted the European balance of power. (C) The power of monarchs had grown to the point where it needed to be checked by other powers within each nation or domination of civilians would occur. (D) The rising and falling economic cycle of the newly emerging capitalist economy could lead to civilian unrest that must be suppressed.\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The fears of revolution in early 19th century Europe expressed by Klemens von Metternich, a conservative Austrian statesman, were a direct result of the age of Enlightenment, a period of European history where the absolute power of the monarchy was challenged with ideas of individual liberty and nationalism, leading to the French revolution and its effects all over Europe. 
The answer is (A).\n\nQ: This question refers to the following information.\nThe excerpts below are from the Navigation Acts of 1651.\n[A]fter the first day of December, one thousand six hundred fifty and one, and from thence forwards, no goods or commodities whatsoever of the growth, production or manufacture of Asia, Africa or America, or of any part thereof; or of any islands belonging to them, or which are described or laid down in the usual maps or cards of those places, as well of the English plantations as others, shall be imported or brought into this Commonwealth of England, or into Ireland, or any other lands, islands, plantations, or territories to this Commonwealth belonging, or in their possession, in any other ship or ships, vessel or vessels whatsoever, but only in such as do truly and without fraud belong only to the people of this Commonwealth, or the plantations thereof, as the proprietors or right owners thereof; and whereof the master and mariners are also of the people of this Commonwealth, under the penalty of the forfeiture and loss of all the goods that shall be imported contrary to this act. . . .\n[N]o goods or commodities of the growth, production, or manufacture of Europe, or of any part thereof, shall after the first day of December, one thousand six hundred fifty and one, be imported or brought into this Commonwealth of England, or any other lands or territories to this Commonwealth belonging, or in their possession, in any ship or ships, vessel or vessels whatsoever, but in such as do truly and without fraud belong only to the people of this Commonwealth, and in no other, except only such foreign ships and vessels as do truly and properly belong to the people of that country or place, of which the said goods are the growth, production or manufacture.\nWhich of the following best describes the outcome of the Navigation Acts of 1651?\n(A) They served as a catalyst for the growth of English shipping and overseas trade, but did little to limit the prospects of the Dutch in the seventeenth century. (B) They brought about almost immediate hardships for the Dutch economy as their dominance of overseas trade quickly ended. (C) They were rescinded during the restoration of the Stuarts as they sought normal diplomatic relations with the Dutch so not as to need Parliament's financial support for war. (D) They led to nearly a century of recurrent war between England and the Netherlands, which would not end until after American independence.\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The Navigation Acts of 1651 helped English shipping by restricting the ability of ships from other European countries, especially the Dutch, to transport goods from colonies in Asia and Africa into England. The answer is (A).\n\nQ: This question refers to the following information.\nIn Russia there was nothing going on well, and [Souvarine] was in despair over the news he had received. His old companions were all turning to the politicians; the famous Nihilists who made Europe tremble\u2014sons of village priests, of the lower middle class, of tradesmen\u2014could not rise above the idea of national liberation, and seemed to believe that the world would be delivered\u2014when they had killed their despot\u2026\n\"Foolery! They'll never get out of it with their foolery.\"\nThen, lowering his voice still more, in a few bitter words he described his old dream of fraternity. 
He had renounced his rank and his fortune; he had gone among workmen, only in the hope of seeing at last the foundation of a new society of labour in common. All the sous in his pockets had long gone to the urchins of the settlement; he had been as tender as a brother with the colliers, smiling at their suspicion, winning them over by his quiet workmanlike ways and his dislike of chattering. But decidedly the fusion had not taken place.\nHis voice changed, his eyes grew bright, he fixed them on \u00c9tienne, directly addressing him:\n\"Now, do you understand that? These hatworkers at Marseilles who have won the great lottery prize of a hundred thousand francs have gone off at once and invested it, declaring that they are going to live without doing anything! Yes, that is your idea, all of you French workmen; you want to unearth a treasure in order to devour it alone afterwards in some lazy, selfish corner. You may cry out as much as you like against the rich, you haven't got courage enough to give back to the poor the money that luck brings you. You will never be worthy of happiness as long as you own anything, and your hatred of the bourgeois proceeds solely from an angry desire to be bourgeois yourselves in their place.\"\n\u2014\u00c9mile Zola, French writer, Germinal, 1885\nThe passage displays the direct concern for the welfare of the working classes that was typically a part of which movement?\n(A) Capitalist (B) Scientific (C) Communist (D) Existentialist\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The modern Communist movement aims to establish a classless society based on communal ownership and distribution of property and means of production, thereby especially benefiting the working classes. The answer is (C).\n\nQ: This question refers to the following information.\nThe following excerpt is from a pamphlet.\nYou will do me the justice to remember, that I have always strenuously supported the Right of every man to his own opinion, however different that opinion might be to mine. He who denies to another this right, makes a slave of himself to his present opinion, because he precludes himself the right of changing it.\nThe most formidable weapon against errors of every kind is Reason. I have never used any other, and I trust I never shall.\nThe circumstance that has now taken place in France of the total abolition of the whole national order of priesthood, and of everything appertaining to compulsive systems of religion, and compulsive articles of faith, has not only precipitated my intention, but rendered a work of this kind exceedingly necessary, lest in the general wreck of superstition, of false systems of government, and false theology, we lose sight of morality, of humanity, and of the theology that is true.\nI believe in one God, and no more; and I hope for happiness beyond this life.\nI believe in the equality of man; and I believe that religious duties consist in doing justice, loving mercy, and endeavoring to make our fellow-creatures happy.\nI do not believe in the creed professed by the Jewish church, by the Roman church, by the Greek church, by the Turkish church, by the Protestant church, nor by any church that I know of. 
My own mind is my own church.\nAll national institutions of churches, whether Jewish, Christian or Turkish, appear to me no other than human inventions, set up to terrify and enslave mankind, and monopolize power and profit.\nI do not mean by this declaration to condemn those who believe otherwise; they have the same right to their belief as I have to mine.\n\u2014Thomas Paine, The Age of Reason, 1794\u20131795\nWhich of the following Enlightenment philosophes designed a system of checks and balances for government to avoid abuses of power?\n(A) Jean Jacques Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft (D) Adam Smith\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. Baron Montesquieu was an 18th-century French philosopher who wrote extensively against the monopolization of power and advocated for a system of checks and balances in government to prevent the rise of despotism. The answer is (B).", "high_school_geography": "The following are multiple choice questions (with answers) about high school geography.\n\nQ: Which one of the following items is an example of nonmaterial culture?\n(A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not physical objects. The answer is (C).\n\nQ: During the third stage of the demographic transition model, which of the following is true?\n(A) Birth rates increase and population growth rate is less rapid. (B) Birth rates decline and population growth rate is less rapid. (C) Birth rates increase and population growth rate increases. (D) Birth rates decrease and population growth rate increases.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. The demographic transition model describes five stages of population growth as a country goes through economic development; the third stage refers to a period of declining birth rates and slower population growth. The answer is (B).\n\nQ: The practice of hiring a foreign third-party service provider to run an operation is called\n(A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. \"Offshoring\" literally means to move or base some of the activities or processes of a company to a foreign country. The answer is (B).\n\nQ: Which of the following statements is NOT accurate regarding the services provided by local governments in the United States?\n(A) Duplication of efforts occurs often. (B) Social problems of the central city spill over into the surrounding residential suburbs. (C) Inefficiency in providing services occurs often. (D) One neighborhood's efforts to reduce pollution are always supported by neighboring communities.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. There may be economic, social or political reasons for two neighboring communities and their local governments not agreeing to pollution reduction efforts initiated by one of them. The answer is (D).\n\nQ: The rate of natural increase of a population is found by subtracting the\n(A) crude death rate from the crude birth rate. (B) crude birth rate from the crude death rate. (C) doubling time from the crude birth rate. (D) fertility rate from the crude death rate.\nA: Let's think step by step. 
We refer to Wikipedia articles on geography for help. Subtracting the crude death rate from the crude birth rate gives the rate of natural increase of a population. The answer is (A).", "high_school_government_and_politics": "The following are multiple choice questions (with answers) about high school government and politics.\n\nQ: Which of the following best states an argument made by James Madison in The Federalist number 10?\n(A) Honest politicians can prevent factions from developing. (B) Factions are more likely to occur in large republics than in small ones. (C) The negative effects of factionalism can be reduced by a republican government. (D) Free elections are the people's best defense against factionalism.\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. In the Federalist number 10, James Madison advocated for a representative republican form of government to guard against factionalism. The answer is (C).\n\nQ: The term \"budget deficit\" refers to the\n(A) annual increase in federal spending on the military (B) amount of interest on the national debt (C) difference between the initial budget proposals made by the president and Congress (D) amount the government spends in excess of its revenues\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. When the government spends more than it earns, the difference is the budget deficit. The answer is (D).\n\nQ: Which of the following statements about cabinet departments is FALSE?\n(A) They are established by the legislative branch. (B) Their members often don't have much influence over presidential decisions. (C) They cannot all be run by leaders who belong to the same political party the president does. (D) Not every federal agency is a cabinet department.\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. There is no law stipulating that some cabinet department leaders have to belong to a political party different from that of the president. The answer is (C).\n\nQ: Which of the following cases established the precedent that a defendant must be informed of the right to remain silent, the right to a lawyer, and protection from self-incrimination?\n(A) Weeks v. United States (B) Betts v. Brady (C) Mapp v. Ohio (D) Miranda v. Arizona\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. In the landmark case Miranda v. Arizona (1966), the US Supreme Court, based on the Fifth and Sixth Amendments of the US Constitution, guaranteed a defendant's right to an attorney and protection from self-incrimination. The answer is (D).\n\nQ: Uncertainty over the limits to presidential power is caused primarily by the fact that\n(A) the constitutional definition of those powers is broad and unspecific (B) most people agree that the Constitution places too many limits on presidential power (C) the Supreme Court consistently refuses to rule on cases concerning presidential powers (D) constitutional amendments have greatly increased presidential powers\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. The US Constitution is not very specific about the powers of the president, leading to uncertainty over their limits. 
The answer is (A).", "high_school_macroeconomics": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\nQ: Which of the following policies best describes supply-side fiscal policy?\n(A) An increase in the money supply (B) Increased government spending (C) Lower taxes on research and development of new technology (D) Higher taxes on household income\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. Supply-side fiscal policy stimulates the economy by encouraging more production of goods and services through reduction in taxes and deregulation. The answer is (C).\n\nQ: The short-run Phillips curve indicates a\n(A) direct relation between unemployment and inflation (B) direct relation between price and quantity demanded (C) inverse relation between price and quantity demanded (D) inverse relation between unemployment and inflation\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. The short-run Phillips curve shows that whenever unemployment decreases below a natural level, the inflation starts increasing, and vice-versa. The answer is (D).\n\nQ: Holding all else equal which of the following monetary policies would be used to boost U.S. exports?\n(A) Increasing the discount rate (B) Increasing the reserve ratio (C) Buying government securities (D) Lowering tariffs\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. Buying government securities leads to reduction in demand for US dollars from foreign buyers, thereby making it cheaper and hence making US exports more attractive. The answer is (C).\n\nQ: A federal deficit occurs when\n(A) exports exceed imports. (B) imports exceed exports. (C) federal tax collections exceed spending. (D) federal spending exceeds federal tax revenues.\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. A federal deficit occurs when federal spending exceeds federal income which is primarily from tax revenues. The answer is (D).\n\nQ: Which of the following is not included in the U.S. GDP?\n(A) The U.S. military opens a new base in a foreign country with 1000 U.S. personnel. (B) Japanese consumers buy thousands of CDs produced in the United States. (C) An American pop singer performs a sold-out concert in Paris. (D) A French theatrical production tours dozens of American cities.\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. The economic transactions related to the performance of the American pop-singer in Paris happens entirely outside the U.S. and hence is not included in the GDP numbers. The answer is (C).", "high_school_mathematics": "The following are multiple choice questions (with answers) about high school mathematics.\n\nQ: Simplify and write the result with a rational denominator: $$\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}$$\n(A) \\frac{3\\sqrt{3}}{3} (B) \\frac{1}{3} (C) \\sqrt{3} (D) \\frac{\\sqrt{3}}{3}\nA: Let's think step by step. Factoring $729=3^6$ and combining the roots $\\frac{1}{2}\\frac{1}{3}\\frac{1}{2}=\\frac{1}{12}$, we get that $\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}=\\left(\\frac{1}{3^6}\\right)^{\\frac{1}{12}}=\\frac{1}{3^{\\frac{1}{2}}}=\\frac{3}{\\sqrt{3}}$ The answer is (D).\n\nQ: Five thousand dollars compounded annually at an $x\\%$ interest rate takes six years to double. 
At the same interest rate, how many years will it take $\$300$ to grow to $\$9600$?\n(A) 12 (B) 1 (C) 30 (D) 5\nA: Let's think step by step. To go from $\$300$ to $\$9600$, the value must go up by a factor of $9600/300=32=2^5$. Since at this interest rate it takes six years for it to double, it will take $5 \cdot 6=30$ years to grow to $\$9600$. The answer is (C).\n\nQ: Ten students take a biology test and receive the following scores: 45, 55, 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students\u2019 test scores?\n(A) 55 (B) 60 (C) 62 (D) 65\nA: Let's think step by step. There are 10 students and the sum of their scores is $45 + 55 + 50 + 70 + 65 + 80 + 40 + 90 + 70 + 85 = 650$, so the mean is $650/10=65$. The answer is (D).\n\nQ: The variable $x$ varies directly as the square of $y$, and $y$ varies directly as the cube of $z$. If $x$ equals $-16$ when $z$ equals 2, what is the value of $x$ when $z$ equals $\frac{1}{2}$?\n(A) -1 (B) 16 (C) -\frac{1}{256} (D) \frac{1}{16}\nA: Let's think step by step. We know that $x \propto y^2$ and $y \propto z^3$, so $x = k z^6$ for some constant $k$. Plugging in for $x=-16$ and $z=2$, the constant value is $k=\frac{x}{z^6}=\frac{-16}{64}=-\frac{1}{4}$. So, when $z=\frac{1}{2}$, the value of $x$ is $x=kz^6=-\frac{1}{4} \cdot \frac{1}{2^6}=-\frac{1}{256}$. The answer is (C).\n\nQ: Joe was in charge of lights for a dance. The red light blinks every two seconds, the yellow light every three seconds, and the blue light every five seconds. If we include the very beginning and very end of the dance, how many times during a seven minute dance will all the lights come on at the same time? (Assume that all three lights blink simultaneously at the very beginning of the dance.)\n(A) 3 (B) 15 (C) 6 (D) 5\nA: Let's think step by step. The least common multiple of 2, 3 and 5 is 30, so all three lights come on together every 30 seconds. A 7 minute dance contains $\frac{7 \cdot 60}{30}=14$ such intervals, and counting the simultaneous blink at the very beginning as well, the three lights will come on at the same time $14+1=15$ times. The answer is (B).", "high_school_microeconomics": "The following are multiple choice questions (with answers) about high school microeconomics.\n\nQ: Which of the following is necessarily a characteristic of oligopoly?\n(A) Free entry into and exit from the market (B) A few large producers (C) One producer of a good with no close substitutes (D) A homogenous product\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. An oligopoly is a market dominated by a few large sellers or producers; such markets typically have high barriers to entry and differentiated products. The answer is (B).\n\nQ: If the government subsidizes producers in a perfectly competitive market, then\n(A) the demand for the product will increase (B) the demand for the product will decrease (C) the consumer surplus will increase (D) the consumer surplus will decrease\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. (A) and (B) are wrong because the demand curve does not change at all. If the government subsidizes producers, the supply will increase, and thus the consumer surplus also increases. The answer is (C).\n\nQ: Which of the following is true of a price floor?\n(A) The price floor shifts the demand curve to the left. (B) An effective floor creates a shortage of the good. (C) The price floor shifts the supply curve of the good to the right. (D) To be an effective floor, it must be set above the equilibrium price.\nA: Let's think step by step. 
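A quick numeric check of the high school mathematics exemplars above; a minimal Python sketch (math.lcm requires Python 3.9+):

import math

print((1 / 729) ** (1 / 12), math.sqrt(3) / 3)             # both ~0.5774 -> (D)
print(int(math.log2(9600 / 300)) * 6)                      # 5 doublings * 6 years = 30 -> (C)
print(sum([45, 55, 50, 70, 65, 80, 40, 90, 70, 85]) / 10)  # 65.0 -> (D)
k = -16 / 2 ** 6                                           # from x = k * z^6 at x=-16, z=2
print(k * 0.5 ** 6)                                        # -1/256 = -0.00390625 -> (C)
print(7 * 60 // math.lcm(2, 3, 5) + 1)                     # 15 simultaneous blinks -> (B)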
We refer to Wikipedia articles on microeconomics for help. A price floor does not shift the demand or supply curve. An effective price floor must be set above the equilibrium price; otherwise the market clears at the equilibrium and the floor has no effect. The answer is (D).\n\nQ: The concentration ratio for a monopoly is\n(A) 0 (B) 5 (C) 10 (D) 100\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. The concentration ratio is calculated as the sum of the market shares of a specific number of the largest companies. Monopoly means one company or entity controls the entire market; therefore, the concentration ratio is 100 percent. The answer is (D).\n\nQ: In a competitive labor market for housepainters, which of the following would increase the demand for housepainters?\n(A) An effective minimum wage imposed on this labor market. (B) An increase in the price of gallons of paint. (C) An increase in the construction of new houses. (D) An increase in the price of mechanical painters so long as the output effect exceeds the substitution effect.\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. An increase in the construction of new houses means an increased demand for house painting, which in turn increases the demand for housepainters. The answer is (C).", "high_school_physics": "The following are multiple choice questions (with answers) about high school physics.\n\nQ: A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?\n(A) 10 W (B) 30 W (C) 60 W (D) 240 W\nA: Let's think step by step. Rate of energy usage is known as power; in a dissipative electrical circuit, power is given by voltage times current. So in our case, the power is 120 V times 2 amps, or 240 W. The answer is (D).\n\nQ: A point charge, Q = +1 mC, is fixed at the origin. How much work is required to move a charge, Q = +8 \u00b5C, from the point (0, 4 meters) to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\nA: Let's think step by step. To calculate the work required to move a charge from one location to another in a fixed electric field, it is enough to calculate the potential difference between the two locations. Here, the potential energy only depends on the distance between the charges; it\u2019s $k q_1 q_2 / r$, where $k$ is Coulomb\u2019s constant. The charge moves from a distance of 4 meters to a distance of 3 meters, so the work is $k q_1 q_2 (1/3 - 1/4)$. Plugging in $q_1 = 1$ mC and $q_2 = 8$ \u00b5C gives 5.992 J, which rounds to 6 J. The answer is (B).\n\nQ: Which of the following conditions will ensure that angular momentum is conserved? I. Conservation of linear momentum II. Zero net external force III. Zero net external torque\n(A) I and II only (B) I and III only (C) II and III only (D) III only\nA: Let's think step by step. Torque is the rate of change of angular momentum; if there is zero net external torque, angular momentum is conserved. The answer is (D).\n\nQ: A photocell of work function \u03d5 = 2eV is connected to a resistor in series. Light of frequency f = 1 \u00d7 10^15 Hz hits a metal plate of the photocell. If the power of the light is P = 100 W, what is the current through the resistor?\n(A) 2:00 AM (B) 6:00 AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The only answer above which has units of current is D, 24 A. The answer is (D).\n\nQ: A pipe full of air is closed at one end. A standing wave is produced in the pipe, causing the pipe to sound a note. 
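The two quantitative physics exemplars above can be verified directly; a minimal sketch, assuming Coulomb's constant k = 8.99e9 N*m^2/C^2:

print(120 * 2)  # P = V * I = 240 W -> (D)

k = 8.99e9      # Coulomb's constant, N*m^2/C^2 (assumed value)
q1, q2 = 1e-3, 8e-6
print(k * q1 * q2 * (1 / 3 - 1 / 4))  # ~5.99 J, rounds to 6 J -> (B)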
Which of the following is a correct statement about the wave\u2019s properties at the closed end of the pipe?\n(A) The pressure is at a node, but the particle displacement is at an antinode. (B) The pressure is at an antinode, but the particle displacement is at a node. (C) The pressure and the particle displacement are both at nodes. (D) The pressure and the particle displacement are both at antinodes.\nA: Let's think step by step. At the closed end of the pipe, the particles cannot have any net displacement because the pipe closure stops them. So the particle displacement is at a node. This closure also causes the pressure to be maximal, i.e. an antinode. The answer is (B).", "high_school_psychology": "The following are multiple choice questions (with answers) about high school psychology.\n\nQ: Pascale is interested in the processing strategies children use to learn new information. Pascale would best be classified as what type of psychologist?\n(A) sociocultural (B) clinical (C) cognitive (D) behaviorist\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. A sociocultural psychologist focuses on the effect of societal factors on people. A clinical psychologist focuses on people with mental health issues. A cognitive psychologist focuses on how people think and learn, including their processing strategies. A behaviorist focuses on how environment and experience affect people. The answer is (C).\n\nQ: According to Caplan's model of consultee-centered case consultation, the consultant is primarily interested in\n(A) identifying the causes and solutions of the client's presenting problems (B) identifying and eliminating the causes of the consultee's difficulties in handling a problem (C) establishing a hierarchy of authority to enable effective decision making (D) presenting a single, well-defined and unambiguous course of action for the consultant to overcome skills deficits\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Caplan defines two types of case consultation. Client-centered case consultation aims to handle the client's problems, while consultee-centered case consultation aims to identify the reasons for the consultee's difficulties in handling a problem. The answer is (B).\n\nQ: According to the Individuals with Disabilities Education Improvement Act, which of the following must an educational agency do before it changes the educational placement of a student with a disability?\n(A) Give the child a trial period in the new environment (B) Notify the parents in writing (C) Obtain school board approval (D) Obtain parental consent\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Before changing the educational placement of a student with a disability, the educational agency must first notify the parents in writing. The answer is (B).\n\nQ: While swimming in the ocean, Ivan is frightened by a dark shadow in the water even before he has the chance to identify what the shadow is. The synaptic connections taking place during this incident of fright are best described by which of the following?\n(A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are sent from the thalamus to the \"what\" and \"where\" pathways. (C) Messages are sent from the parasympathetic nervous system to the cerebral cortex. (D) Messages are sent from the frontal lobes to the pituitary gland.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. 
Our neural system has a mechanism that can respond to immediate emotional signals before they reach the thought centers. In Ivan's case, messages travel directly from the thalamus to the amygdala. The answer is (A).\n\nQ: Ani believes that her attitudes and behavior play a central role in what happens to her. Such a belief is likely to be associated with\n(A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an internal locus of control.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. People with an external locus of control believe fate and luck play an important role in their lives, while people with an internal locus of control believe they control their lives. The answer is (D).", "high_school_statistics": "The following are multiple choice questions (with answers) about high school statistics.\n\nQ: A new smartwatch is manufactured in one part of a factory, then secured for shipping in another, independent part of the factory. The weight of the smartwatch has a mean of 62 grams and a standard deviation of 1.0 grams. The weight of the packaging (box, user's guide, bubble wrap, etc.) has a mean of 456 grams and a standard deviation of 6 grams. Together, the distribution of the weight of the smartwatch and its packaging would have the following mean and standard deviation:\n(A) Mean 518 grams; standard deviation 7.0 grams (B) Mean 518 grams; standard deviation 3.5 grams (C) Mean 518 grams; standard deviation 6.1 grams (D) Mean 394 grams; standard deviation 6.1 grams\nA: Let's think step by step. Since the weight of the watch and the weight of the packaging are independent random variables, the mean and variance of their sum are equal to the sum of their individual means and variances. So the mean is 62 + 456 = 518 grams, and the variance is 1.0^2 + 6.0^2 = 37, leading to a standard deviation of 6.1 grams. The answer is (C).\n\nQ: After a frost warning was issued, the owner of a large orange grove asked his workers to spray all his trees with water. The water was supposed to freeze and form a protective covering of ice around the orange blossom. Nevertheless, the owner suspected that some trees suffered considerable damage due to the frost. To estimate the proportion of trees that suffered more than 50 percent damage due to the frost, he took a random sample of 100 trees from his grove. What is the response variable in this experiment?\n(A) The proportion of trees that suffered more than 50 percent damage due to frost. (B) The number of trees affected by the frost. (C) The number of trees sampled from the grove. (D) For each sampled tree, whether it suffered more than 50 percent damage or at most 50 percent damage.\nA: Let's think step by step. In this experiment, the response variable is what is measured. For each tree, what is measured is whether or not it suffered more than 50 percent damage due to the frost. The answer is (D).\n\nQ: Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and var(Y) = 12. What are the expected value and variance of the random variable X + Y?\n(A) E(X + Y) = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X + Y) = 99, var(X + Y) = 17 (D) There is insufficient information to answer this question.\nA: Let's think step by step. 
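A one-line check of the smartwatch exemplar above; for independent random variables the variances add, not the standard deviations:

import math

mean = 62 + 456                      # means of independent variables add
sd = math.sqrt(1.0 ** 2 + 6.0 ** 2)  # variances add; take the square root at the end
print(mean, round(sd, 1))            # 518 6.1 -> (C)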
While the means of sums of random variables add (regardless of whether the variables are independent), to determine the variance of a sum of random variables we need to know not just their individual variances but also the covariance of the two variables, which is not given in this problem. The answer is (D).\n\nQ: Which of the following sets has the smallest standard deviation? Which has the largest?\nI: {1,2,3}\nII: {-10,10}\nIII: {100}\n(A) I, II (B) II, III (C) III, I (D) III, II\nA: Let's think step by step. The variance of distribution I is the expected squared deviation from its mean (which is 2), so the variance is 2/3. The variance of distribution II is 10^2 (because both elements are 10 away from the mean of zero). The variance of distribution III is 0, since it has a single entry. So distribution III has the smallest standard deviation and distribution II has the largest. The answer is (D).\n\nQ: Which of the following is a correct statement about correlation?\n(A) If the slope of the regression line is exactly 1, then the correlation is exactly 1. (B) If the correlation is 0, then the slope of the regression line is undefined. (C) Switching which variable is called x and which is called y changes the sign of the correlation. (D) The correlation r is equal to the slope of the regression line when z-scores for the y-variable are plotted against z-scores for the x-variable.\nA: Let's think step by step. Statement A is false because the slope of the regression line being exactly 1 can occur even when the two variables are not perfectly correlated. Statement B is false because when the correlation is 0, the slope of the regression line is 0, not undefined. Statement C is false because correlation is symmetric in the two random variables. The answer is (D).", "high_school_us_history": "The following are multiple choice questions (with answers) about high school us history.\n\nQ: This question refers to the following information.\nI come not to urge personal claims, nor to seek individual benefits; I appear as the advocate of those who cannot plead their own cause; I come as the friend of those who are deserted, oppressed, and desolate. In the Providence of God, I am the voice of the maniac whose piercing cries from the dreary dungeons of your jails penetrate not your Halls of Legislation. I am the Hope of the poor crazed beings who pine in the cells, and stalls, and cages, and waste rooms of your poor-houses. 
I am the Revelation of hundreds of wailing, suffering creatures, hidden in your private dwellings, and in pens and cabins\u2014shut out, cut off from all healing influences, from all mind-restoring cares.\u2026 Could their melancholy histories be spread before you as revealed to my grieved spirit during the last three months, how promptly, how earnestly would you search out the most approved means of relief; how trifling, how insignificant, by comparison, would appear the sacrifices you are asked to make; how would a few dimes and dollars, gathered from each citizen, diminish in value as a possession, compared with the certain benefits and vast good to be secured for the suffering insane...by the consecration and application of a sufficient fund to the construction of a suitable hospital.\u2026\n\u2014Dorothea Dix, Memorial Soliciting a State Hospital for the Protection and Cure of the Insane,\nSubmitted to the General Assembly of North Carolina, November 1848\nDorothea Dix can best be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C) Shirley Temple (D) Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. Both Dorothea Dix and Clara Barton are American nurses. The answer is (B).\n\nQ: This question refers to the following information.\n\"As our late Conduct at the Conestoga Manor and Lancaster have occasioned much Speculation & a great diversity of Sentiments in this and neighboring Governments; some vindicating & others condemning it; some charitably alleviating the Crime, & others maliciously painting it in the most odious & detestable Colours, we think it our duty to lay before the Publick, the whole Matter as it appeared, & still appears, to us. . . .\n\"If these things are not sufficient to prove an unjustifiable Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend them & an utter insensibility to human Distresses, let us consider a few more recent Facts. When we found the last Summer that we were likely to get no Assistance from the Government, some Volunteers went out at our own Expense, determined to drive our Enemies from our Borders; & when we came near to the great Island, we understood that a Number of their Warriors had gone out against our Frontiers. Upon this we returned and came up with them and fought with them at the Munfey Hill where we lost some of our Men & killed some of their Warriors & thereby saved our Frontiers from this Story in another Expedition. But no sooner had we destroyed their Provisions on the great Island, & ruined their trade with the good People at Bethlehem, but these very Indians, who were justly suspected of having murdered our Friends in Northampton County, were by the Influence of some Quakers taken under the Protection of the Government to screen them from the Resentments of the Friends and Relations of the Murdered, & to support them thro the Winter.\"\n\u2014\"Apology of the Paxton Boys\" (pamphlet), 1764 (Note: \"apology\" in this context should be read as an explanation, not an admission of guilt or regret.)\nThe sentiments expressed in the explanation above reflect which of the ongoing tensions during the colonial period of American history?\n(A) Tensions between British policies and the aspirations of North American colonists. (B) Tensions between American Indians allied with the French and those allied with the British. (C) Tensions between freed African Americans and white planters. 
(D) Tensions between backcountry settlers and elites within colonial America.\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. After the French and Indian War, the Scotch-Irish settlers attacked American Indians. After the attacks on the Conestoga, about 250 Paxton Boys presented their grievances to the Pennsylvania legislature. As mentioned in the information, the Paxton Boys cited resentment of local elites. The answer is (D).\n\nQ: This question refers to the following information.\nOur leaders talk about stopping aggression from the north, but this was a struggle among groups of Vietnamese until we intervened. We seem bent upon saving the Vietnamese from Ho Chi Minh even if we have to kill them and demolish their country to do it. As the native people survey bombed-out villages, women and children burned by napalm, rice crops destroyed and cities overrun with our military personnel, they are doubtless saying secretly of the Vietcong guerillas and of the American forces, \"A plague on both your houses.\" \u2026 Stop the bombing, north and south, end search and destroy offensive sweeps, and confine our military action to holding operations on the ground. Bombing the north has failed to halt or seriously check the flow of troops to the south and may, in fact, have prompted a much greater war effort by Hanoi.\n\u2014Senator George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\nWhich of the following opinions from the 1960s most directly reflects the perspective of George McGovern's speech?\n(A) Americans must maximize their technological edge in Vietnam. (B) American bombing in Vietnam is step by step leading to progress in the war. (C) American bombing in Vietnam is a failure. (D) America must not give in to defeatism about the war in Vietnam.\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. \"Stop the bombing\" and \"Bombing the north has failed to halt or seriously check the flow of troops to the south\" indicate that the perspective of George McGovern's speech is that American bombing in Vietnam is a failure. The answer is (C).\n\nQ: This question refers to the following information.\n\"In the new Code of Laws which I suppose it will be necessary for you to make I desire you would Remember the Ladies, and be more generous and favorable to them than your ancestors. Do not put such unlimited power into the hands of the Husbands. Remember all Men would be tyrants if they could. If particular care and attention is not paid to the Ladies we are determined to foment a Rebellion, and will not hold ourselves bound by any Laws in which we have no voice, or Representation.\"\nAbigail Adams, in a letter to John Adams, 1776\n\"Special legislation for woman has placed us in a most anomalous position. Women invested with the rights of citizens in one section\u2014voters, jurors, office-holders\u2014crossing an imaginary line, are subjects in the next. In some States, a married woman may hold property and transact business in her own name; in others, her earnings belong to her husband. In some States, a woman may testify against her husband, sue and be sued in the courts; in others, she has no redress in case of damage to person, property, or character. In case of divorce on account of adultery in the husband, the innocent wife is held to possess no right to children or property, unless by special decree of the court. 
But in no State of the Union has the wife the right to her own person, or to any part of the joint earnings of the co-partnership during the life of her husband. In some States women may enter the law schools and practice in the courts; in others they are forbidden. In some universities girls enjoy equal educational advantages with boys, while many of the proudest institutions in the land deny them admittance, though the sons of China, Japan and Africa are welcomed there. But the privileges already granted in the several States are by no means secure.\"\nSusan B. Anthony, \"Declaration of Rights for Women,\" July 4, 1876\nThe sentiments expressed in the second excerpt by Susan B. Anthony are most likely in support of\n(A) the Equal Rights Amendment (B) universal suffrage (C) states' rights (D) prohibition\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. The information above mentions that women are in an anomalous position in terms of legislation. In some states women's earnings do not belong to them, and they cannot testify against their husbands. Susan B. Anthony believes women should have the same legal rights as men. The answer is (B).\n\nQ: This question refers to the following information.\n\"Society in every state is a blessing, but government even in its best state is but a necessary evil; in its worst state an intolerable one; for when we suffer, or are exposed to the same miseries by a government, which we might expect in a country without government, our calamity is heightened by reflecting that we furnish the means by which we suffer. Government, like dress, is the badge of lost innocence; the palaces of kings are built on the ruins of the bowers of paradise. For were the impulses of conscience clear, uniform, and irresistibly obeyed, man would need no other lawgiver; but that not being the case, he finds it necessary to surrender up a part of his property to furnish means for the protection of the rest; and this he is induced to do by the same prudence which in every other case advises him out of two evils to choose the least. Wherefore, security being the true design and end of government, it unanswerably follows that whatever form thereof appears most likely to ensure it to us, with the least expense and greatest benefit, is preferable to all others.\"\nThomas Paine, Common Sense, 1776\nWhich of the following \"miseries\" alluded to above were most condemned by Anti-Federalists of the post-Revolutionary era?\n(A) Organized response to Bacon's Rebellion (B) Federal response to Shays's Rebellion (C) Federal response to the Whiskey Rebellion (D) Federal response to Pontiac's Rebellion\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. Anti-Federalists did not believe in centralized government power, and were suspicious of Washington's military response to the Whiskey Rebellion. Bacon's Rebellion and Pontiac's Rebellion happened before the Revolution, so they can be ruled out. The answer is (C).", "high_school_world_history": "The following are multiple choice questions (with answers) about high school world history.\n\nQ: This question refers to the following information.\n\"At least one of the [world's] societies would have to somehow enormously increase its productivity [in order to achieve global hegemony]. That quantum jump would have to be made before the various scientific, technological, agricultural, and industrial revolutions on which our post-quantum-leap world rests. 
It could only be accomplished by exploiting the ecosystems, mineral resources, and human assets of whole continents outside the lands of the society making the jump. Western Europe did just that by means of its brutality and guns and, more important, by geographical and ecological luck.\"\nCopyright \u00a9 2015 Cambridge University Press.\nAlfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum jump\" mentioned in the passage most directly contributed to which of the following developments in the period 1450\u20131750 C.E.?\n(A) A breakdown in trade routes through the collapse of the established state structure (B) An increase in the population of the world through more plentiful supplies of food (C) The spread of Chinese and Indian belief systems across the world (D) An increase in social unrest\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The \"quantum jump\" mentioned in the passage refers to the conquest of the New World and the Columbian Exchange. Choices (A) and (C) did not happen in history. Choice (C) refers to the human assets. The answer is (B).\n\nQ: This question refers to the following information.\n\"The struggle against neo-colonialism is not aimed at excluding the capital of the developed world from operating in less developed countries. It is aimed at preventing the financial power of the developed countries being used in such a way as to impoverish the less developed.\nNon-alignment, as practiced by Ghana and many other countries, is based on co-operation with all States whether they be capitalist, socialist or have a mixed economy. Such a policy, therefore, involves foreign investment from capitalist countries, but it must be invested in accordance with a national plan drawn up by the government of the non-aligned State with its own interests in mind. The issue is not what return the foreign investor receives on his investments\u2026The question is one of power. A State in the grip of neo-colonialism is not master of its own destiny.\"\nKwame Nkrumah, Neo-Colonialism, 1965\nWhich of the following provides the best context for Nkrumah's writings?\n(A) The Industrial Revolution (B) Decolonization (C) Regional Free Trade Associations (D) Autarky\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage expresses the point that the fight against neo-colonialism was in danger, and that newly independent nations like Ghana might be re-colonized via the financial power of the developed countries. The answer is (B).\n\nQ: This question refers to the following information.\n\"Indeed, as both the fatwas of distinguished [scholars] who base their opinion on reason and tradition alike and the consensus of the Sunni community agree that the ancient obligation of extirpation, extermination, and expulsion of evil innovation must be the aim of our exalted aspiration, for \"Religious zeal is a victory for the Faith of God the Beneficent\"; then, in accordance with the words of the Prophet (Peace upon him!) 
\"Whosoever introduces evil innovation into our order must be expelled\" and \"Whosoever does aught against our order must be expelled,\" action has become necessary and exigent\u2026\"\nLetter from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514\nThe letter from Selim I is most clearly an example of which of the following?\n(A) The maintenance of military supremacy at all costs (B) Expanding tensions between religious sects (C) Factors that brought about the collapse of the Ottoman Empire (D) Peacemaking efforts among the Islamic empires\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage is an example of expanding tensions between Selim and Ismail. In the passage the Selim references the fatwa and the consensus of the Sunni community to against whosoever introduces evil. The answer is (B).\n\nQ: This question refers to the following information.\n\"The real grievance of the worker is the insecurity of his existence; he is not sure that he will always have work, he is not sure that he will always be healthy, and he foresees that he will one day be old and unfit to work. If he falls into poverty, even if only through a prolonged illness, he is then completely helpless, exam_ins to his own devices, and society does not currently recognize any real obligation towards him beyond the usual help for the poor, even if he has been working all the time ever so faithfully and diligently. The usual help for the poor, however, leaves a lot to be desired, especially in large cities, where it is very much worse than in the country.\"\nOtto von Bismarck, 1884\nOtto von Bismarck likely made this speech in reaction to which of the following issues?\n(A) Social acceptance of child labor (B) Declining life expectancy in Germany (C) Criticisms of German trade tariffs (D) Negative effects attributed to industrial capitalism\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage talks about the grievance of the work under the industrial capitalism. The answer is (D).\n\nQ: This question refers to the following information.\nHe contains all works and desires and all perfumes and all tastes. He enfolds the whole universe and in silence is loving to all. This is the Spirit that is in my heart, this is Brahman. To him I shall come when I go beyond this life, and to him will come he who has faith and doubts not.\n\u2014The Upanishads, India, c. 1000 BCE\nTo which religion does the speaker most likely belong?\n(A) Hinduism (B) Buddhism (C) Shintoism (D) Zoroastrianism\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. Brahman refers to the ultimate reality of all things in the Hindu religion. In contrast, Buddhism does not have a concept of supreme God. The answer is (A).", "human_aging": "The following are multiple choice questions (with answers) about human aging.\n\nQ: All other things being equal, which of the following persons is more likely to show osteoporosis?\n(A) An older Hispanic American woman (B) An older African American woman (C) An older Asian American woman (D) An older Native American woman\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. Although osteoporosis can occur at any age, the risk is higher for older people. It is most common in Asian and non-Hispanic white women. 
The answer is (C).\n\nQ: The finding that adults tend to remember events from their adolescence better than from other periods in their lives is referred to as the\n(A) Adolescence advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic retrieval spike\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. The reminiscence bump is the phenomenon whereby older adults tend to best recollect events from their youth. People usually have a period of childhood amnesia from birth to around age 5, and a reminiscence bump between ages 10 and 30. The answer is (B).\n\nQ: Which element in tobacco smoke is responsible for cancers?\n(A) Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. The benzene, acrylamide and acrylonitrile in tar interact with the lungs and cause DNA mutations in cells of the lungs, leading to cancer. The answer is (B).\n\nQ: When older adults move to a new state after retirement, which of the following is the more likely destination?\n(A) Texas (B) California (C) Hawaii (D) Vermont\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. Texas does not have a state income tax, and has a low cost of living compared with the other three options. The answer is (A).", "human_sexuality": "The following are multiple choice questions (with answers) about human sexuality.\n\nQ: The most common disorder among men who seek sexual therapy is:\n(A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory disorder\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. The most common type of arousal disorder for men is erectile dysfunction, meaning that a person is not able to get any physical satisfaction from sexual activity although he may be interested in it. The answer is (C).\n\nQ: A woman who knows she has active herpes and untreated syphilis but continues to have sex without informing her partners of her condition has, in psychoanalytic terms:\n(A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. A person with a weak superego tends to be delinquent, criminal, or have an antisocial personality. The actions of the woman, who knows she has an active venereal disease but still has sex with her partners, indicate she may have an antisocial personality. The answer is (B).\n\nQ: The nature of homosexual activities that occur during preadolescence include all but which of the following?\n(A) sexual intercourse (B) circle jerks (C) exhibitionism (D) touching each other's genitals\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. Sexual intercourse is prohibited during preadolescence. The answer is (A).\n\nQ: Women's ability to have multiple orgasms is primarily due to:\n(A) the fact that they do not have a refractory period. (B) the response of the inner layers of the vagina. (C) having alternating orgasms in different locations. (D) the G-Spot.\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. The refractory period is the time when a person is not able to be erect or is not interested in sex. The answer is (A).\n\nQ: Morning sickness is typically a problem:\n(A) during the first trimester (B) during the second trimester (C) during the third trimester (D) all through the pregnancy\nA: Let's think step by step. 
We refer to Wikipedia articles on human sexuality for help. Morning sickness usually begins by nine weeks after conception, corresponding to the first trimester. The answer is (A).", "international_law": "The following are multiple choice questions (with answers) about international law.\n\nQ: How the consent to be bound of a State may be expressed?\n(A) The consent of a State to be bound is expressed only by ratification (B) The consent of a state to be bound by a treaty may be expressed by signature, ratification, acceptance, approval or accession (C) The consent of a State to be bound is expressed by signature (D) The consent of a State to be bound is expressed by whatever means they choose\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Article 11 of the Vienna Convention on the Law of Treaties, signed in 1969, states that \"the consent of a State to be bound by a treaty may be expressed by signature, exchange of instruments constituting a treaty, ratification, acceptance, approval or accession, or by any other means if so agreed.\" (B) is the most precise and accurate answer. The answer is (B).\n\nQ: What is the judge ad hoc?\n(A) If a party to a contentious case before the ICJ does not have a national sitting as judge, it is entitled to nominate someone as a judge solely for that case, with the title of judge ad hoc (B) Judge ad hoc is the member of the bench of the ICJ with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge is disqualified or passes away (D) Judge ad hoc is the judge that each party will always nominate in every contentious case\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. As \"ad hoc\" implies, a judge ad hoc is appointed only for a specific case or period, when a party to a contentious case before the International Court of Justice does not have a national sitting as judge. The answer is (A).\n\nQ: When 'consent' can serve as a circumstance precluding the wrongfulness of a State conduct?\n(A) Consent can serve as a circumstance precluding the wrongfulness whenever it is given (B) Consent can never serve as a circumstance precluding wrongfulness (C) Consent can serve as a circumstance precluding wrongfulness, provided the consent is valid and to the extent that the conduct remains within the limits of the consent given (D) Consent can always serve as a circumstance precluding wrongfulness, no matter which organ of the State gives it\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Valid consent can serve as a circumstance precluding the wrongfulness of a State conduct if the conduct remains within the limits of that consent, according to Chapter V of the Responsibility of States for Internationally Wrongful Acts, 2001, United Nations. The answer is (C).\n\nQ: Would a reservation to the definition of torture in the ICCPR be acceptable in contemporary practice?\n(A) This is an acceptable reservation if the reserving country's legislation employs a different definition (B) This is an unacceptable reservation because it contravenes the object and purpose of the ICCPR (C) This is an unacceptable reservation because the definition of torture in the ICCPR is consistent with customary international law (D) This is an acceptable reservation because under general international law States have the right to enter reservations to treaties\nA: Let's think step by step. 
We refer to Wikipedia articles on international law for help. Because it contravenes the object and purpose of the ICCPR, this is an unacceptable reservation in contemporary practice. The answer is (B).\n\nQ: What types of force does Article 2(4) of the UN Charter prohibit?\n(A) Article 2(4) encompasses only armed force (B) Article 2(4) encompasses all types of force, including sanctions (C) Article 2(4) encompasses all interference in the domestic affairs of States (D) Article 2(4) encompasses force directed only against a State's territorial integrity\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Article 2(4) of the UN Charter prohibits states from using armed force in their international relations. The answer is (A).", "jurisprudence": "The following are multiple choice questions (with answers) about jurisprudence.\n\nQ: Iverson Jewelers wrote a letter to Miller, 'We have received an exceptionally fine self winding Rolox watch which we will sell to you at a very favorable price.'\n(A) The letter is an offer to sell (B) A valid offer cannot be made by letter. (C) The letter contains a valid offer which will terminate within a reasonable time. (D) The letter lacks one of the essential elements of an offer.\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. An offer shows the intent to enter into a mutually-beneficial contract with specific terms. An offer can be made by a letter. While this letter indicates the willingness to sell, the lack of specific terms, such as transaction price and offer expiration date, makes it an incomplete offer. The answer is (D).\n\nQ: Functions of the law include all but which of the following?\n(A) maximizing individual freedom (B) providing a basis for compromise (C) keeping the peace (D) promoting the principles of the free enterprise system\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. Laws are fundamentally about helping resolve disputes between individuals, and are therefore essential for maximizing individual freedom, providing a basis for compromise, and keeping the peace. The answer is (D).\n\nQ: The ________ School of jurisprudence postulates that the law is based on what is \"correct.\"\n(A) Natural Law (B) Analytical (C) Historical (D) Sociological\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. The Natural Law School of jurisprudence focuses on the laws of nature, and states that the law should be based on ethics, morals, and what is \"correct\". Analytical deals with the law as it already exists, Historical postulates that the law was found and not made, and Sociological studies how the law and society impact each other. The answer is (A).\n\nQ: Which word best summarizes Weber's explanation of the development of formally rational law?\n(A) Authority. (B) Charisma. (C) Co-operation. (D) Capitalism.\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. Weber explained the development of formally rational law as part of how modern society moved from tradition to rationality, where people decide actions based less on how things were culturally done and more on expected utilities. How rational individuals optimize the efficiency of accomplishing tasks for higher rewards is a core principle of Capitalism. 
The answer is (D).\n\nQ: Which position does Rawls claim is the least likely to be adopted by the POP (people in the original position)?\n(A) The POP would choose equality above liberty. (B) The POP would opt for the 'maximin' strategy. (C) The POP would opt for the 'difference principle'. (D) The POP would reject the 'system of natural liberty.'\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. The POP would opt for the 'maximin' strategy, opt for the 'difference principle', and reject the 'system of natural liberty', but the POP would not choose equality above liberty, since the POP assumes citizens who are both equal and free. The answer is (A).", "logical_fallacies": "The following are multiple choice questions (with answers) about logical fallacies.\n\nQ: When an arguer causes confusion during refutation because of real or feigned lack of an ability to engage in refutation, that arguer may have committed the fallacy of\n(A) poor sportsmanship (B) appeal to compassion (C) argument against the person (D) ignorance of refutation\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. Ignorance of refutation, one of the fallacies in Aristotle's original list in his Organon, is when someone causes confusion in an argument through real or feigned inability to engage in refutation, in order to win the argument. The answer is (D).\n\nQ: The complex question fallacy consists of\n(A) arguing something is inferior just because it doesn't do something it was never intended to do. (B) including more than one claim in the proposition and treating proof for one claim as proof for all the claims. (C) drawing a conclusion before examining the evidence, and only considering evidence that supports that conclusion. (D) asking a question that includes either an unproven assumption or more than one question, thus making a straightforward yes or no answer meaningless.\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The complex question fallacy is when someone makes a single yes or no answer to a question meaningless, by including either an unproven assumption or many questions. The latter is also known as the many questions fallacy. The answer is (D).\n\nQ: Arguing that what is true of the parts must be true of the whole is the fallacy of...\n(A) Division (B) Composition (C) Appeal to the person (D) Appeal to ignorance\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The fallacy of composition occurs when someone argues that what is true of the parts must be true of the whole. The answer is (B).\n\nQ: Which of the following is true of a valid categorical syllogism?\n(A) The minor premise must deny the antecedent (B) The major premise must affirm the consequent (C) The middle term must be used in at least one premise in a universal or unqualified sense (D) All of the above\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. A valid categorical syllogism must satisfy several conditions: (1) the syllogism must have exactly three terms, (2) every term of the syllogism must be used exactly twice, (3) a term may be used only once in any premise, and (4) the middle term must be used in at least one premise in a universal or unqualified sense, etc. Only (C) is true. 
The answer is (C).\n\nQ: If someone attacks the character of an opposing arguer, instead of responding to that opponent's arguments, the first person has probably committed which of the following fallacies?\n(A) tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The argument against the person fallacy occurs when someone irrelevantly attacks the character of an opposing arguer, instead of addressing that opponent's arguments. The answer is (C).", "machine_learning": "The following are multiple choice questions (with answers) about machine learning.\n\nQ: Which image data augmentation is most common for natural images?\n(A) random crop and horizontal flip (B) random crop and vertical flip (C) posterization (D) dithering\nA: Let's think step by step. Data augmentation is used to increase the diversity of images in the training dataset. It is important that natural images are kept natural after being augmented. Vertical flips of images are not natural, so (B) is false. Posterization makes the image look like a poster, and dithering simulates increased color depth. Neither of these preserves the natural property. The only natural data augmentation technique is (A). The answer is (A).\n\nQ: Traditionally, when we have a real-valued input attribute during decision-tree learning we consider a binary split according to whether the attribute is above or below some threshold. Pat suggests that instead we should just have a multiway split with one branch for each of the distinct values of the attribute. From the list below choose the single biggest problem with Pat\u2019s suggestion:\n(A) It is too computationally expensive. (B) It would probably result in a decision tree that scores badly on the training set and a testset. (C) It would probably result in a decision tree that scores well on the training set but badly on a testset. (D) It would probably result in a decision tree that scores well on a testset but badly on a training set.\nA: Let's think step by step. Because the input is real valued, it is unlikely that the same values appear both at training and test time. This means that while such a decision tree could yield good performance on the training data, when evaluated on the test data it will perform badly because the decision tree won\u2019t know what to do with numbers that did not appear in the training data. The answer is (C).\n\nQ: You are reviewing papers for the World\u2019s Fanciest Machine Learning Conference, and you see submissions with the following claims. Which ones would you consider accepting?\n(A) My method achieves a training error lower than all previous methods! (B) My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise test error.) (C) My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise cross-validation error.) (D) My method achieves a cross-validation error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise cross-validation error.)\nA: Let's think step by step. In machine learning, we train with some data and fixed hyperparameters and the training error can be arbitrarily low, so (A) can\u2019t be right. 
Then one compares different hyperparameters by selecting the model with the lowest cross-validation error; this means that (B) and (D) do not follow the right procedure. The only relevant number after these is the test error, and thus (C) is the right answer. The answer is (C).\n\nQ: A 6-sided die is rolled 15 times and the results are: side 1 comes up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times; side 5: 4 times; side 6: 5 times. Based on these results, what is the probability of side 3 coming up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16 (D) 1.0/5\nA: Let's think step by step. Add-1 smoothing adds the value of one to the different counts and then normalizes the probabilities accordingly. The counts after adding one will be: side 1 comes up 1 time; side 2: 2 times; side 3: 3 times; side 4: 4 times; side 5: 5 times; side 6: 6 times. The total count will then be 21, so the probability of rolling a three is 3/21 = 1/7. The answer is (B).\n\nQ: To achieve a 0/1 loss estimate that is less than 1 percent of the true 0/1 loss (with probability 95%), according to Hoeffding's inequality the IID test set must have how many examples?\n(A) around 10 examples (B) around 100 examples (C) between 100 and 500 examples (D) more than 1000 examples\nA: Let's think step by step. By Hoeffding\u2019s inequality, we expect that with 95% probability the in-sample and out-of-sample errors differ by less than epsilon when we have N samples if 2 exp(-2 epsilon^2 N) < 0.05; this implies that N > ln(40) / (2 epsilon^2) = 5000 ln(40) for epsilon = 0.01. Since ln(40) > 1, one needs more than 1000 examples. The answer is (D).", "management": "The following are multiple choice questions (with answers) about management.\n\nQ: How can organisational structures that are characterised by democratic and inclusive styles of management be described?\n(A) Hierarchical (B) Bureaucratic (C) Flat (D) Functional\nA: Let's think step by step. We refer to Wikipedia articles on management for help. Flat organizational structures are characterized by democratic and inclusive styles of management, and have few (if any) levels of management between the workers and managers. The answer is (C).\n\nQ: Hygiene factors are associated with which writer?\n(A) Frederick Hertzberg (B) D.C. McClelland (C) Abraham Maslow (D) Douglas McGregor\nA: Let's think step by step. We refer to Wikipedia articles on management for help. Hygiene factors include compensation, company policies, supervision, interpersonal relations, and work environments. Hertzberg lists them as factors that cannot motivate employees but can minimize job dissatisfaction. The answer is (A).\n\nQ: What characteristic is not a key feature of the 'open systems' model of management?\n(A) Morale (B) Innovation (C) Growth resource (D) Adaptation\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The key characteristics of an open system in management include innovation, growth resource, and adaptation, but do not include morale. The answer is (A).\n\nQ: Which element of the cultural web forms regalia?\n(A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The cultural web is a tool for mapping an organization's culture, where symbols form the regalia that visually expresses the values that the organization holds as important. 
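The two quantitative machine-learning exemplars above can be verified numerically; a minimal sketch in plain Python, assuming epsilon = 0.01 as in the worked answer.

import math

# Add-1 (Laplace) smoothing: add one to each side's count, then renormalize.
counts = [0, 1, 2, 3, 4, 5]         # observed counts for sides 1..6
smoothed = [c + 1 for c in counts]  # [1, 2, 3, 4, 5, 6], total 21
print(smoothed[2] / sum(smoothed))  # 3/21 = 1/7 -> choice (B)

# Hoeffding bound: 2*exp(-2*eps^2*N) < 0.05 rearranges to N > ln(40)/(2*eps^2).
eps = 0.01
n_min = math.log(40) / (2 * eps**2)
print(round(n_min))                 # about 18444, i.e. more than 1000 -> choice (D)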
The answer is (A).\n\nQ: What are the two main dimensions of the Ohio Studies into leadership?\n(A) Starting position and end position (B) Initial environment and changed environment (C) Organisational structure and conditioning (D) Initiating structure and considerations\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The Ohio State Leadership Studies conducted in the 1940s identified initiating structure and consideration as the two main dimensions of leader behavior. The answer is (D).", "marketing": "The following are multiple choice questions (with answers) about marketing.\n\nQ: Although the content and quality can be as controlled as direct mail, response rates of this medium are lower because of the lack of a personal address mechanism. This media format is known as:\n(A) Care lines. (B) Direct mail. (C) Inserts. (D) Door to door.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Door to door marketing delivers non-addressed items to all buildings within a geographic area. While it can control the content and quality as well as direct mail marketing can, its response rate is lower because of the lack of a personal address mechanism. The answer is (D).\n\nQ: In an organization, the group of people tasked with buying decisions is referred to as the _______________.\n(A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making unit.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. In an organization, the group of people tasked with buying decisions is referred to as the decision-making unit. The answer is (D).\n\nQ: The single group within society that is most vulnerable to reference group influence is:\n(A) The older consumer who feels somewhat left out of things. (B) The married women, many of whom feel a need for stability in their lives. (C) New immigrants who really want to assimilate into their new culture. (D) Children, who base most of their buying decisions on outside influences.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Children, who base most of their buying decisions on outside influences, are the single group within society that is most vulnerable to reference group influence. The answer is (D).\n\nQ: Which of the following is an assumption in Maslow's hierarchy of needs?\n(A) Needs are dependent on culture and also on social class. (B) Lower-level needs must be at least partially satisfied before higher needs can affect behaviour. (C) Needs are not prioritized or arranged in any particular order. (D) Satisfied needs are motivators, and new needs emerge when current needs remain unmet.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Maslow's hierarchy of needs, from the bottom upwards, consists of physiological (food and clothing), safety, love and belonging, esteem, and self-actualization needs. Lower-level needs must be at least partially satisfied before higher ones can affect behavior. The answer is (B).\n\nQ: _____________ is a natural outcome when combining demographic and geographic variables.\n(A) Geodemographics (B) Product differentiation. (C) ANSOFF matrix. (D) Brand management.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Geodemographics is a natural outcome when combining demographic and geographic variables. 
The answer is (A).", "medical_genetics": "The following are multiple choice questions (with answers) about medical genetics.\n\nQ: The stage of meiosis in which chromosomes pair and cross over is:\n(A) prophase I (B) metaphase I (C) prophase II (D) metaphase II\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Prophase I is the stage of meiosis where homologous chromosomes pair with each other and exchange genetic material. The answer is (A).\n\nQ: DNA ligase is\n(A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial origin which cuts DNA at defined base sequences (C) an enzyme that facilitates transcription of specific genes (D) an enzyme which limits the level to which a particular nutrient reaches\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. DNA ligase is a type of enzyme (EC 6.5.1.1) responsible for joining DNA strands together by catalyzing a phosphodiester bond. The answer is (A).\n\nQ: Which of the following conditions does not show multifactorial inheritance?\n(A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural tube defects) (D) Marfan syndrome\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Multifactorial inheritance is when more than a single factor is responsible for causing a given trait or health problem. Genes cannot be the only factor. Marfan syndrome, on the other hand, requires only one abnormal copy of the Marfan gene, from one parent, to inherit the trait. The answer is (D).\n\nQ: A gene showing codominance\n(A) has both alleles independently expressed in the heterozygote (B) has one allele dominant to the other (C) has alleles tightly linked on the same chromosome (D) has alleles expressed at the same time in development\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Codominance, as it relates to genetics, refers to a type of genetic inheritance where the phenotypes of both parents are easily observed in the offspring. A heterozygote is an individual having two different alleles of a gene. The answer is (A).\n\nQ: Large triplet repeat expansions can be detected by:\n(A) polymerase chain reaction. (B) single strand conformational polymorphism analysis. (C) Southern blotting. (D) Western blotting.\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. A Southern blot is a method in molecular biology for detecting specific DNA sequences in a sample. Large triplet repeat expansions are usually detected with this method. The answer is (C).", "miscellaneous": "The following are multiple choice questions (with answers) about miscellaneous.\n\nQ: Which of these songs was a Top 10 hit for the rock band The Police?\n(A) 'Radio Ga-Ga' (B) 'Ob-la-di Ob-la-da' (C) 'De Do Do Do De Da Da Da' (D) 'In-a-Gadda-Da-Vida'\nA: Let's think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by Queen. Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly. That leaves 'De Do Do Do De Da Da Da' as the only song by The Police, and also a Top 10 hit. The answer is (C).\n\nQ: What place is named in the title of the 1979 live album by rock legends Cheap Trick?\n(A) Budapest (B) Budokan (C) Bhutan (D) Britain\nA: Let's think step by step. We refer to Wikipedia for help. Nippon Budokan is an indoor arena in Tokyo, Japan renowned for hosting rock music concerts, including Cheap Trick in 1978. 
'Cheap Trick at Budokan' became the name of their album. The answer is (B).\n\nQ: What is produced during photosynthesis?\n(A) hydrogen (B) nylon (C) oxygen (D) light\nA: Let's think step by step. We refer to Wikipedia for help. Photosynthesis is the process in which green plants use the green pigment chlorophyll to synthesize foods from water and carbon dioxide. Oxygen is a byproduct of this process. The answer is (C).\n\nQ: Who is the shortest man to ever win an NBA slam dunk competition?\n(A) Anthony 'Spud' Webb (B) Michael 'Air' Jordan (C) Tyrone 'Muggsy' Bogues (D) Julius 'Dr J' Erving\nA: Let's think step by step. We refer to Wikipedia for help. In 1986, Spud Webb, standing only 5'7\", became the shortest NBA player in history to win an official slam dunk contest. The answer is (A).\n\nQ: How many axles does a standard automobile have?\n(A) one (B) two (C) four (D) eight\nA: Let's think step by step. We refer to Wikipedia for help. Most cars have two axles to rotate the wheels. The answer is (B).", "moral_disputes": "The following are multiple choice questions (with answers) about moral disputes.\n\nQ: Baron admits that the versions of the ticking bomb hypothetical she discusses are \"stunningly stupid,\" but she claims this is actually evidence of\n(A) the stupidity of most traditional philosophical examples. (B) a general lack of intelligence among people with advanced degrees. (C) the wrongness of torture. (D) the readiness on the part of many intelligent people to see torture as the best solution to deal with terrorism.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. The ticking bomb hypothetical poses a problem where many people will die from an exploding bomb if the hypothetical terrorist does not disclose how to defuse it. Baron sees this hypothetical as silly, but its prevalence does suggest that intelligent people, particularly utilitarians, see torture as justifiable to save the lives in this scenario. The answer is (D).\n\nQ: A fertilized ovum is also known as\n(A) a zygote. (B) an embryo. (C) a viability. (D) a blastocyst.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. Once a single sperm penetrates the layers of an egg to form a new cell, that cell is called a zygote. The answer is (A).\n\nQ: Pence compares six different cases of reproduction, from natural twinning to SCNT. What conclusion does he draw from this comparison?\n(A) SCNT is not a different kind of reproduction because there are no morally relevant differences between it and other permissible means of reproduction. (B) Because there is a low risk of harm for natural twinning, there will be a low risk of harm for SCNT. (C) both A and B (D) neither A nor B\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. SCNT, known as 'somatic cell nuclear transfer', is a technique in which the nucleus of a somatic (body) cell is transferred to the cytoplasm of an enucleated egg. This is used in farm animal cloning. Pence may compare this to other processes of reproduction, and see little moral separation. The answer is (A).\n\nQ: Which of the following is an intrinsic good, according to Singer?\n(A) being in the upper class (B) the ability to learn and solve problems (C) being more beautiful than most of the population (D) being shorter than average\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. 
Bioethicist Peter Singer sees intrinsic value as innate value conferred by oneself, for oneself. Inanimate objects can be beautiful, short, or have some other valued attribute, but capabilities are intrinsically good. The answer is (B).\n\nQ: According to Metz, what is wrong with consequentialist arguments against capital punishment based on African values?\n(A) It is unclear as of yet whether or not capital punishment deters harm to the community. (B) It is unclear as of yet whether or not capital punishment deters harm to any individuals. (C) Consequentialism is not supported by African values. (D) Even though consequentialism is supported by African values, no consequentialist arguments framed in terms of African values have been offered.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. Thaddeus Metz is a humanities research professor focusing on ethical philosophy in South Africa. Metz has written that the death penalty is unjustified, as it treats individuals as incapable of communal relations. It is unclear whether capital punishment is to the benefit of, or a deterrent of harm to, the community. The answer is (A).", "moral_scenarios": "The following are multiple choice questions (with answers) about moral scenarios.\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I took his wallet after he harassed me at the bar. Scenario 2 | I made love to my newborn baby boy.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Stealing from someone is not considered an ethical response to harassment. And having sex with a minor is egregiously unethical. The answer is (A).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I asked for a cigarette from the guy outside the store. Scenario 2 | When the security guard left his post, I pocketed a rare coin.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Asking for something is not morally wrong. Stealing is morally wrong. The answer is (C).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | During the house's demolition, I pushed him into a wall. Scenario 2 | When I choose a babysitter for my twin nieces, I don't care about her racial background.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Physical violence is morally wrong. Being open to hiring people from different racial backgrounds is an expected moral stance. The answer is (B).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I bought an old pick-up truck for work. 
Scenario 2 | I looked up at the gorgeous waterfall and threw a rock at it.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Neither of these is wrong, as no one is wronged by buying a pick-up truck or by throwing a rock at a waterfall. The answer is (D).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Loving someone is not wrong. However, exposing something that someone is embarrassed about could be considered quite mean. The answer is (C).", "nutrition": "The following are multiple choice questions (with answers) about nutrition.\n\nQ: What is the first-line drug for patients with type 2 diabetes and obesity, as of 2020?\n(A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. Metformin (Fortamet, Glumetza, or others) is usually the first medication prescribed for type 2 diabetes, as well as obesity. It works by lowering glucose production in the liver and improving the body's sensitivity to insulin. The answer is (B).\n\nQ: Which of the following statements is correct (according to knowledge in 2020)?\n(A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame (B) Consumers with phenylketonuria must avoid the consumption of the sweetener saccharin (C) Consumers with phenylketonuria must avoid the consumption of the sweetener sucralose (D) Consumers with phenylketonuria must avoid the consumption of the sweetener acesulfame K\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. People with phenylketonuria (PKU) cannot break down the amino acid phenylalanine. As it builds up in the blood and brain, it can lead to brain damage. People with PKU should avoid foods that are converted to phenylalanine in the body, such as aspartame. The answer is (A).\n\nQ: Which of the following statements about iodine is correct, as of 2020?\n(A) 50% of adults consume iodine at levels below the RNI (B) Dairy products are a poor source of iodine (C) The iodine content of organic milk is generally lower than the level in non-organic milk (D) UK dietary reference values recommend an increase in iodine intake in pregnancy\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. Organic milk usually has less iodine content than non-organic milk. The answer is (C).\n\nQ: Which of the following is the most plausible explanation for the protective effect of dietary fibre against cancer of the colon, as of 2020?\n(A) Propionic acid, formed during colonic fibre fermentation inhibits liver fatty acid synthesis (B) Butyric acid, formed during colonic fibre fermentation stimulates \"silencing\" of the SLC5A8 tumour suppressor gene (C) None of these options are correct (D) Butyric acid, formed during colonic fibre fermentation stimulates anti-oxidant defences in the colon\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. 
Dietary fibre intake is inversely related to the risk of colorectal cancer. This is presumed to be because butyric acid (BA) stimulates anti-oxidant defences which help protect the colon from cancerous tumors. The answer is (D).\n\nQ: In a cohort study, the risk ratio of developing diabetes was 0.86 when comparing consumers of tea (the exposed) to those who did not drink tea (the unexposed). Which one statement is correct (according to knowledge in 2020)?\n(A) The tea drinkers have lower risk of developing diabetes. (B) The tea drinkers have higher risk of developing diabetes. (C) Based on the information given we cannot tell if the observed difference in disease risk is the result of chance. (D) The risk ratio is close to the value one, so there is no difference in disease risk between the two groups.\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. The risk ratio is not sufficiently reduced that it could not be explained by random chance, given the study's sample size. The answer is (C).", "philosophy": "The following are multiple choice questions (with answers) about philosophy.\n\nQ: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n(A) metaphysics (B) epistemology (C) quantum physics (D) axiology\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Among the options, only metaphysics studies the nature of reality and existence. The answer is (A).\n\nQ: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\n(A) pleasure. (B) happiness. (C) good. (D) virtue.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Moore's \"ideal utilitarianism\" states that one's actions should maximize intrinsic goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n(A) optimist (B) satisfied (C) nominally religious (D) pessimist\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Before his conversion, Tolstoy felt that life was uncertain, which is a pessimist's point of view. The answer is (D).\n\nQ: According to d'Holbach, people always act according to _____.\n(A) free choices (B) dictates of the soul (C) necessary natural laws (D) undetermined will\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. d'Holbach believed that people act according to necessary natural laws, leaving no room for free will. The answer is (C).\n\nQ: Psychological egoism is:\n(A) an ethical theory about how we ought to behave. (B) a generalization concerning the way people tend to behave. (C) a claim about human nature and the ways people are capable of behaving. (D) none of the above.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Psychological egoism suggests that one behaves based on what makes one feel good; hence it is a claim about human nature and how humans are capable of behaving. The answer is (C).", "prehistory": "The following are multiple choice questions (with answers) about prehistory.\n\nQ: What is the approximate mean cranial capacity of Homo erectus?\n(A) under 650 cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. The average cranial capacity of Homo erectus is less than 1000 cubic cm. 
The answer is (C).\n\nQ: According to Timothy Pauketat, the evidence for social stratification and political power at Cahokia suggests:\n(A) a center of Mississippian civilization with conditions similar to the rise of early states. (B) the limitations of authority in a Native American society of egalitarian foragers. (C) a simple chiefdom or perhaps a complex chiefdom had evolved by A.D. 1500. (D) a center of Mississippian civilization with conditions similar to societies on the Northwest Coast of North America.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian culture, where he found conditions similar to the rise of early states. The answer is (A).\n\nQ: Recent research on hominid species dating from the Middle Pliocene indicates there was (as of 2020):\n(A) a great amount of species diversity, or a single species that exhibited a lot of diversity. (B) very little species diversity during this period and very few hominids. (C) decreased species diversity due to a prolonged ice age followed by a severe drought. (D) decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Recent research has recognized multiple hominid species from the Middle Pliocene, meaning that there was a great amount of species diversity or diversity in a single species. The answer is (A).\n\nQ: Researchers now believe that the decline of the Maya was caused chiefly by:\n(A) a cataclysm of some kind, such as an earthquake, volcano, or tsunami. (B) ecological degradation resulting from slash-and-burn farming techniques. (C) endless wars between neighboring Mayan city-states. (D) practices of interbreeding that led to a steep rise in congenital disorders.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Researchers believe that the Maya collapse was mainly caused by over-exploitation of natural resources through slash-and-burn farming techniques. The answer is (B).\n\nQ: The great Mayan king Pacal built temples in the city of Palenque in order to:\n(A) satisfy the powerful Mayan astronomer priests. (B) display his generosity to the common people, since they were allowed to live in the temples. (C) frighten away enemies, in particular the Spaniards. (D) legitimize his kingship, since his father was not royal.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Pacal built the temples as a funerary monument to legitimize his kingship. The answer is (D).", "professional_accounting": "The following are multiple choice questions (with answers) about professional accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer\u2019s subledger. Which of the following management assertions is supported by this test?\n(A) Valuation and allocation (B) Completeness (C) Rights and obligations (D) Presentation and disclosure\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The completeness assertion is tested by tracing supporting documents to the record entries. The answer is (B).\n\nQ: One hundred years ago, your great-great-grandmother invested $100 at 5% yearly interest. What is the investment worth today?\n(A) $13,000 (B) $600 (C) $15,000 (D) $28,000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. 
A $100 investment at 5% yearly interest is worth 100*(1.05)^100=13150 after 100 years, which is around $13,000. The answer is (A).\n\nQ: On January 1, year 1, Alpha Co. signed an annual maintenance agreement with a software provider for $15,000 and the maintenance period begins on March 1, year 1. Alpha also incurred $5,000 of costs on January 1, year 1, related to software modification requests that will increase the functionality of the software. Alpha depreciates and amortizes its computer and software assets over five years using the straight-line method. What amount is the total expense that Alpha should recognize related to the maintenance agreement and the software modifications for the year ended December 31, year 1?\n(A) $5,000 (B) $13,500 (C) $16,000 (D) $20,000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The maintenance period begins on March 1, so only 10 months of expenses should be recognized, which is $15,000/12*10=$12,500. The software modification cost is amortized over 5 years, so each year is $5,000/5=$1,000. So the total expense is $12,500+$1,000=$13,500. The answer is (B).\n\nQ: Krete is an unmarried taxpayer with income exclusively from wages. By December 31, year 1, Krete's employer has withheld $16,000 in federal income taxes and Krete has made no estimated tax payments. On April 15, year 2, Krete timely filed for an extension request to file her individual tax return, and paid $300 of additional taxes. Krete's year 1 tax liability was $16,500 when she timely filed her return on April 30, year 2, and paid the remaining tax liability balance. What amount would be subject to the penalty for underpayment of estimated taxes?\n(A) $0 (B) $500 (C) $1,650 (D) $16,500\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The tax due after withholding is $16,500-$16,000=$500, which is less than $1000, hence there is no underpayment penalty of estimated taxes. The answer is (A).\n\nQ: Box a nongovernmental not-for-profit organization had the following transactions during the year: Proceeds from sale of investments $80000 Purchase of property plant and equipment $10000 Proceeds from long-term debt $100000 Loss on sale of investment $5000 What amount should be reported as net cash provided by financing activities in Box's statement of cash flows?\n(A) $70,000 (B) $75,000 (C) $80,000 (D) 100000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. Among the four transactions, only Proceeds from long-term debt belongs to the financing activities section of cashflow, hence the amount reported should be $100000. The answer is (D).", "professional_law": "The following are multiple choice questions (with answers) about professional law.\n\nQ: A son owed a creditor $5,000. The son's father contacted the creditor and told him that he wanted to pay the son's debt. The father signed a document that stated the father would pay the son's debt at a rate of $500 a month for 10 months. The creditor made no written or oral commitment to forbear to sue the son to collect the $5,000 debt, and the father made no oral or written request for any such forbearance. For the next five months, the father made and the creditor accepted the $500 monthly payments as agreed. During that period, the creditor, in fact, did forbear to take any legal action against the son. However, the father then informed the creditor that he would make no further payments on the debt. 
Which of the following is the most persuasive argument that the father is liable to the creditor under the terms of their agreement?\n(A) The father's promise and the creditor's reliance thereon, if proved, gave rise to a valid claim by the creditor against the father based on the doctrine of promissory estoppel. (B) Because it was foreseeable that the father's promise would induce the creditor to forbear taking any action against the son, such forbearance was, as a matter of law, a bargained-for consideration for the father's promise. (C) The father's five payments to the creditor totaling $2,500 manifested a serious intent on the father's part to be contractually bound, and such manifestation is generally recognized as an effective substitute for consideration. (D) By assuming the antecedent debt obligation that the son owed to the creditor, the father became a surety whose promise to the creditor was enforceable, since it was in writing and supported by adequate consideration. \nA: Let's think step by step. We refer to Wikipedia articles on law for help. The doctrine of promissory estoppel stops a person from going back on a promise in contract law, hence option (A) should be the most persuasive argument. The answer is (A).\n\nQ: A state has recently enacted a statute prohibiting the disposal of any nuclear wastes within the state. This law does not contravene or conflict with any federal statutes. A man operates a company in the state that is engaged in the disposal of nuclear wastes. Subsequent to the passage of the state statute, the man, not yet aware of the new law, entered into contracts with many out-of-state firms to dispose of their nuclear wastes in the state. On account of this new law, however, the man will be unable to perform these contracts. Assume that the man has standing to challenge this state law. Which of the following presents his strongest constitutional grounds to challenge the state law prohibiting the disposal of nuclear wastes within the state?\n(A) The commerce clause. (B) The equal protection clause of the Fourteenth Amendment. (C) The privileges and immunities clause of Article IV, Section 2. (D) The contract clause.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. The commerce clause states that Congress shall have the power to regulate commerce with foreign Nations, and among the several States, and with the Indian Tribes. The statute affects inter-state commerce which puts it into question. Hence the man's strongest argument should be the commerce clause. The answer is (A).\n\nQ: On October 1, 1980, a developer, owner of several hundred acres in a rural county, drafted a general development plan for the area. The duly recorded plan imposed elaborate limitations and restrictions upon the land in the plan, which was to be developed as a residential district. The restrictions were to extend to all persons acquiring any of the lots and to their heirs, assigns, and lessees. It was further provided that all subsequent owners would be charged with due notice of the restrictions. Among those restrictions in the general plan were the following:(22) A franchise right is created in a strip of land 10 feet in width along the rear of each lot for the use of public utility companies with right of ingress and egress. (23) No house or structure of any kind shall be built on the aforementioned strip of land running through the said blocks. 
In 2000, a retiree purchased one of the lots, built a house, and erected a fence in the rear of his property within the restricted area. In 2004, a teacher purchased a lot adjacent to the retiree's property and built a new house. Two years later, a librarian purchased the lot that adjoined the teacher's property. The three deeds to those properties each contained references to the deed book where the general plan was recorded. In 2008, the librarian began the construction of a seven-foot post-and-rail fence along the line dividing his lot with the teacher's, and along the center of the area subject to the franchise right. Although the teacher objected to its construction, the fence was completed. If the teacher seeks a mandatory injunction to compel removal of the librarian's fence, the court will most likely\n(A) grant relief, because the fence was in violation of the easement restriction. (B) grant relief, because the encroachment of the fence violated the restriction in the original plan. (C) deny relief, because the teacher failed to enforce the restriction against the retiree. (D) deny relief, because the fence would not be construed as \"a structure\" within the terms of the restriction. \nA: Let's think step by step. We refer to Wikipedia articles on law for help. The restrictions in the original plan say no house or structure of any kind shall be built on the aforementioned strip of land running through the said blocks. Hence the court will most likely grant relief because the fence violated the restriction in the original plan. The answer is (B).\n\nQ: Judge took judicial notice of some facts at the beginning of the trial. Which of the following is not an appropriate kind of fact for judicial notice?\n(A) Indisputable facts. (B) Facts that have been asserted by individual political organizations. (C) Facts recognized to be true by common knowledge. (D) Facts capable of scientific verification.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. Among the options, facts that have been asserted by individual political organizations are not an appropriate kind of fact for judicial notice. The answer is (B).\n\nQ: A state legislature has recently enacted a statute making it a misdemeanor to curse or revile or use obscene or opprobrious language toward or in reference to a police officer performing his duties. A student at a state university organized a demonstration on campus to protest the war. The rally was attended by a group of 50 students who shouted anti-war messages at cars passing by. To show his contempt for the United States, the student sewed the American flag to the rear of his jeans. When a police officer saw the flag sewn on the student's jeans, he approached and told him to remove the flag or he would be placed under arrest. The student became angered and shouted at the police officer, \"Listen, you bastard, I'll wear this rag anywhere I please. \" The student was subsequently placed under arrest and charged with violating the state statute. The student subsequently brings suit in state court challenging the constitutionality of the statute. The strongest constitutional argument for the student is that\n(A) the statute is void for vagueness under the Fourteenth Amendment's due process clause. (B) the statute is invalid because it violates the petitioner's freedom of speech under the First Amendment. 
(C) the statute is an abridgment of freedom of speech under the First Amendment because less restrictive means are available for achieving the same purpose. (D) the statute is overbroad and consequently invalid under the First and Fourteenth Amendments.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. The Fourteenth Amendment further supports the First Amendment by establishing a due process clause. Hence the strongest argument should be that the statute is overbroad and consequently invalid under the First and Fourteenth Amendments. The answer is (D).", "professional_medicine": "The following are multiple choice questions (with answers) about professional medicine.\n\nQ: A 22-year-old male marathon runner presents to the office with the complaint of right-sided rib pain when he runs long distances. Physical examination reveals normal heart and lung findings and an exhalation dysfunction at ribs\u00a04-5 on the right. Which of the following muscles or muscle groups will be most useful in correcting this dysfunction utilizing a direct method?\n(A) anterior scalene (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Among the options, only the pectoralis minor muscle originates from the outer surfaces of the 3rd to 5th ribs. The answer is (C).\n\nQ: A 36-year-old male presents to the office with a\u00a03-week\u00a0history of low back pain. He denies any recent trauma but says that he climbs in and out of his truck numerous times a day for his job. Examination of the patient in the prone position reveals a deep sacral sulcus on the left, a posterior inferior lateral angle on the right, and a lumbosacral junction that springs freely on compression. The most likely diagnosis is\n(A) left-on-left sacral torsion (B) left-on-right sacral torsion (C) right unilateral sacral flexion (D) right-on-right sacral torsion\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The deep sulcus on the left, a posterior ILA on the right, with a negative spring test suggests a right-on-right sacral torsion. All other options have a deep sulcus on the right. The answer is (D).\n\nQ: A 44-year-old man comes to the office because of a 3-day history of sore throat, nonproductive cough, runny nose, and frontal headache. He says the headache is worse in the morning and ibuprofen does provide some relief. He has not had shortness of breath. Medical history is unremarkable. He takes no medications other than the ibuprofen for pain. Vital signs are temperature 37.4\u00b0C (99.4\u00b0F), pulse 88/min, respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the nares shows erythematous mucous membranes. Examination of the throat shows erythema and follicular lymphoid hyperplasia on the posterior oropharynx. There is no palpable cervical adenopathy. Lungs are clear to auscultation. Which of the following is the most likely cause of this patient's symptoms?\n(A) Allergic rhinitis (B) Epstein-Barr virus (C) Mycoplasma pneumonia (D) Rhinovirus\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The symptoms, especially the headache, suggest that the most likely cause is Rhinovirus. Epstein-Barr virus will cause swollen lymph nodes but there is no palpable cervical adenopathy. That the lungs are clear to auscultation suggests it's not Mycoplasma pneumonia. 
The answer is (D).\n\nQ: A previously healthy 32-year-old woman comes to the physician 8 months after her husband was killed in a car crash. Since that time, she has had a decreased appetite and difficulty falling asleep. She states that she is often sad and cries frequently. She has been rechecking the door lock five times before leaving her house and has to count exactly five pieces of toilet paper before she uses it. She says that she has always been a perfectionist but these urges and rituals are new. Pharmacotherapy should be targeted to which of the following neurotransmitters?\n(A) Dopamine (B) Glutamate (C) Norepinephrine (D) Serotonin\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The patient feels sad and among the options, only Dopamine and Serotonin can help increase positive emotions. Serotonin also affects digestion and metabolism, which can help the patient's decreased appetite and sleep difficulty. The answer is (D).\n\nQ: A 42-year-old man comes to the office for preoperative evaluation prior to undergoing adrenalectomy scheduled in 2 weeks. One month ago, he received care in the emergency department for pain over his right flank following a motor vehicle collision. At that time, blood pressure was 160/100 mm Hg and CT scan of the abdomen showed an incidental 10-cm left adrenal mass. Results of laboratory studies, including complete blood count, serum electrolyte concentrations, and liver function tests, were within the reference ranges. The patient otherwise had been healthy and had never been told that he had elevated blood pressure. He takes no medications. A follow-up visit in the office 2 weeks ago disclosed elevated urinary normetanephrine and metanephrine and plasma aldosterone concentrations. The patient was referred to a surgeon, who recommended the adrenalectomy. Today, vital signs are temperature 36.6\u00b0C (97.9\u00b0F), pulse 100/min, respirations 14/min, and blood pressure 170/95 mm Hg. Physical examination discloses no significant findings. Initial preoperative preparation should include treatment with which of the following?\n(A) Labetalol (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The symptoms and the adrenal mass suggested pheochromocytoma, and the blood pressure indicates hypertension. Phenoxybenzamine is used to treat hypertension caused by pheochromocytoma. The answer is (D).", "professional_psychology": "The following are multiple choice questions (with answers) about professional psychology.\n\nQ: In the construction of a multiple regression equation for purposes of prediction, the optimal combination of measures is one in which the predictors\n(A) are uncorrelated with each other but are moderately correlated with the criterion (B) have low correlations with each other and low correlations with the criterion (C) are highly intercorrelated with each other and moderately correlated with the criterion (D) have low correlations with the criterion but are moderately correlated with each other\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The basis of multiple regression is to assess the relationship between one continuous variable and a set of independent variables. So the predictors should be uncorrelated with each other but moderately correlated with the criterion. 
The answer is (A).\n\nQ: There are three ways to measure the Central Tendency: the Mean, the Median and the Mode. From your knowledge about them, what is the mode?\n(A) less sensitive to extreme scores than the mean (B) more useful for skewed distributions (C) sensitive to extreme values and highly skewed distributions (D) the most frequently occurring number\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The definition of mode is the most frequently occurring number. The answer is (D).\n\nQ: Carl Jung believed that a client's transference:\n(A) is a fantasy that distracts the client from reality. (B) represents \u201cmixed feelings\u201d toward the therapist. (C) \"is a form of \"\"acting out.\"\"\" (D) reflects the client\u2019s personal and collective unconscious.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Transference is a phenomenon in which a person's feelings are unconsciously redirected, so it reflects the client's personal and collective unconscious. The answer is (D).\n\nQ: In terms of Hofstede\u2019s (1980) five cultural dimensions, the United States scores at the top of the scale on:\n(A) individualism. (B) individualism and power distance. (C) power distance and masculinity. (D) uncertainty avoidance.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The US scores highest on individualism among the five cultural dimensions. The answer is (A).\n\nQ: One of your therapy clients asks your advice about a good weight-reduction program. You have investigated the programs in the community and are enrolled in the one you consider the best. This program offers a $50 bonus to its patrons for each new person they bring into the program. Under these circumstances, your most appropriate response would be to\n(A) tell your client the pros and cons of each program you know about except for the one in which you are enrolled (B) recommend to your client the program in which you are enrolled and explain the $50 bonus you will receive (C) recommend to your client the program in which you are enrolled and offer to have the $50 bonus credited to your client's account in the program (D) tell your client the pros and cons of each program you know about, but do not claim the $50 bonus if your client enrolls in your program\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Based on the circumstances, you should tell your client about the pros and cons of each program, but it would be inappropriate to receive the bonus, so you should not claim the $50 bonus. The answer is (D).", "public_relations": "The following are multiple choice questions (with answers) about public relations.\n\nQ: Earth Hour was a campaign launched by which organization?\n(A) Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. Earth Hour is a worldwide movement organized by the World Wildlife Fund. The answer is (D).\n\nQ: In issues management, what is the most proactive approach to addressing negative or misleading information posted online about your organization?\n(A) Buy domain names that could be used by opposition groups. (B) Post anonymous comments on blogs to combat this information. (C) Prepare a news release that discredits the inaccurate information. (D) Make policy changes to address complaints highlighted on these sites.\nA: Let's think step by step. 
We refer to Wikipedia articles on public relations for help. In issues management, the most proactive approach to addressing negative or misleading information posted online is to make policy changes to address complaints highlighted on those sites. The answer is (D).\n\nQ: At which stage in the planning process would a situation analysis be carried out?\n(A) Defining the program (B) Planning the program (C) Taking action and implementing ideas (D) Evaluation of the program\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. Situation analyses are typically carried out during the planning process stage of defining the program. The answer is (A).\n\nQ: Which of these statements is true of the Vatican in 2010 at the time of the accusations of child abuse cover-ups?\n(A) There was a coordinated media response. (B) Consistent messages were communicated. (C) Criticisms were taken as attacks on the Catholic Church. (D) The credibility of the Vatican was upheld.\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. In 2010 when there were accusations of child abuse cover-ups, the Vatican took those criticisms as attacks on the Catholic Church. The answer is (C).\n\nQ: What should a public relations media practitioner do if she does not know the answer to a reporter's question?\n(A) Give the reporter other information she is certain is correct. (B) Say that the information is 'off the record' and will be disseminated later. (C) Say 'I don't know' and promise to provide the information later. (D) Say 'no comment,' rather than appear uninformed.\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. If a public relations media practitioner does not know the answer to a reporter's question, they should say 'I don't know' and offer to provide the information later. The answer is (C).", "security_studies": "The following are multiple choice questions (with answers) about security studies.\n\nQ: What are the frameworks of analysis within which terrorism has been considered (as of 2020)?\n(A) Competition between larger nations has resulted in some countries actively supporting terrorist groups to undermine the strength of rival states. Terrorist networks are extended patronage clubs maintained and paid for by their donor states and are conceptualised as being like state actors, to be dealt with using military force. (B) Globalization has enabled the internationalization of terrorist activities by opening up their operational space, although coordination is still managed from a geographical base. This suggests that terrorist groups are nationally structured which means that terrorism cannot be considered in terms of a war to be defeated militarily without having serious implications on the indigenous population. (C) Terrorism can be viewed as a problem to be resolved by military means (war on terrorism), by normal police techniques (terrorism as crime), or as a medical problem with underlying causes and symptoms (terrorism as disease). (D) Terrorism is viewed as a criminal problem. The criminalization of terrorism has two important implications. Firstly, it suggests that terrorism can be eradicated - terrorists can be caught and brought to trial by normal judicial proceedings thereby removing the threat from society - and secondly, it suggests that preventative crime techniques are applicable to prevent its development.\nA: Let's think step by step. 
We refer to Wikipedia articles on security studies for help. (A) is wrong because it is not competition between larger nations that causes terrorism. \n(B) is wrong because globalization is not the cause of terrorism.\n(C) is correct because the US undertook the war on terrorism. \n(D) is wrong because preventative crime techniques will likely not end terrorism. The answer is (C).\n\nQ: Which of the following is the best lens through which to investigate the role of child soldiers?\n(A) Child soldiers are victims of combat that need re-education and rehabilitation. (B) Children and their mothers are not active subjects in warfare and are best considered as subjects in the private sphere. (C) Children are most often innocent bystanders in war and are best used as signifiers of peace. (D) Children have political subjecthood that is missed when they are considered as passive victims of warfare.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. The political subjecthood of child soldiers can be missed when they are considered passive victims of warfare. The answer is (D).\n\nQ: How can we best describe the relationship between the state-centric approach and the concept of human security?\n(A) There are such wide divisions within the human security framework regarding the nature of threats and referent objects that no widely applicable comparisons between state-centric approaches and human security can be drawn. (B) By adopting the framework of human security, the limitations of the realist state-centric approach become evident. Whilst human security defines the referent object as the person or population, state-centric approaches prioritise the security of the state, de-prioritizing the pursuit of human security. (C) The state-centric approach to security is a faction of human security, usually defined within the broad school of human security. By being state-centric this approach prioritises the individual as the referent object in security studies. (D) Both the state-centric and human-centric approaches to security are mutually exclusive and offer a sufficient analytic framework with which to understand the international security system. It is therefore the role of security analysts to determine which of these substantial concepts is correct, and which should be discarded.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Human security focuses on a person or population whereas state-centric approaches focus on the state while deprioritizing human security. The answer is (B).\n\nQ: In order to become securitized, a threat must be presented in which of these ways?\n(A) As an existential threat that requires immediate and extraordinary action, posing a threat to the survival of the state or to societal security. (B) As requiring immediate and extraordinary action by the state, threatening the survival of a referent object and therefore warranting the use of measures not normally employed in the political realm. (C) As an urgent threat to the survival of the referent object, so serious that it legitimises the employment of extraordinary action in response. (D) As an urgent threat to the survival of the audience that requires extraordinary or emergency measures.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. To be securitized, a threat must be an urgent threat to the survival of the referent object. 
The answer is (C).\n\nQ: What distinguishes coercive diplomacy from military force?\n(A) Compellence is another term for coercive diplomacy, but covering a narrower set of criteria; compellence covers those threats aimed at initiating adversary action. A threat to coerce a state to give up part of its territory would count as coercive diplomacy, as long as that threat proactively initiates action before reactive diplomacy is taken. (B) Coercive diplomacy constitutes the threats of limited force to induce adversary's incentive to comply with the coercer's demands. It is an influence strategy that is intended to obtain compliance: the use of force to defeat an opponent first does not count. It leaves an element of choice with the target to comply, or to continue. (C) Military force, or the threat of military force, utilises fear to achieve strategic objectives. Coercive diplomacy is differentiated from this approach, because it does not use fear as a tool for coercing an adversary. (D) Coercive diplomacy is employed to use force but to limit its effects on the international community. Coercive diplomacy is an aggressive strategy that is intended to obtain compliance through defeat. It does not leave an element of choice with the target, the target either being forced to comply or engage in conflict. It seeks to control by imposing compliance by removing any opportunity for negotiation or concession.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Coercive diplomacy uses the threat of force to induce the opponent to comply with demands. The answer is (B).", "sociology": "The following are multiple choice questions (with answers) about sociology.\n\nQ: Which of the following is not a problem associated with official statistics on strike action?\n(A) most strikes go unnoticed by employers and the mass media (B) not all industrial disputes will be reported by the employer (C) the definition of strikes excludes those that involve fewer than ten workers or last less than one day (D) it is hard to compare strikes that were measured in different ways\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Official statistics on strike action can be problematic because not all industrial disputes will be reported by employers, the definition of strikes excludes those that involve fewer than ten workers or last less than one day, and it is hard to compare strikes that were measured in different ways. Thus, (A) is not a problem associated with official statistics on strike action. The answer is (A).\n\nQ: What does Berger (1963) describe as a metaphor for social reality?\n(A) a fairground ride (B) a circus (C) a puppet theatre (D) a ballet\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Berger describes social reality using the metaphor of a puppet theatre. The answer is (C).\n\nQ: The term 'hegemony' refers to:\n(A) the tendency for the working class not to realize their own interests (B) a dominant ideology that legitimates economic, political and cultural power (C) a form of dual consciousness based on ideology and everyday experiences (D) a mode of payment given for outstanding topiary\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Hegemony refers to a dominant ideology that legitimates economic, political, and cultural power. 
The answer is (B).\n\nQ: The shift from 'civil religion' to 'common religion' means that:\n(A) the increasing bureaucracy of the state has made religion only a marginal part of our lives (B) despite the weakening of traditional authority, our everyday lives and 'common sense' remain shaped by religious beliefs and values (C) religious participation in collective worship may have declined, but people still practise their faiths in private (D) people are much more likely to discuss their religious beliefs in public, informal settings\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. The shift from civil religion to common religion means that despite the weakening of traditional authority, our everyday lives and common sense remain shaped by religious beliefs and values. The answer is (B).\n\nQ: Which of the following did the post-war welfare state of 1948 not aim to provide:\n(A) free health care and education for all (B) a minimum wage (C) full employment (D) universal welfare\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. The post-war welfare state of 1948 aimed to provide free healthcare and education, full employment, and universal welfare. But it did not aim to provide a minimum wage. The answer is (B).", "us_foreign_policy": "The following are multiple choice questions (with answers) about us foreign policy.\n\nQ: How did Donald Trump attack globalization in the 2016 campaign?\n(A) Globalization had made men like him too rich (B) Globalization only benefited certain American states, such as New York (C) Liberal elites had encouraged globalization, while 'ordinary Americans' lost jobs because of it (D) Globalization encouraged damaging trade wars\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. Trump attacked globalization because he believed ordinary Americans lost jobs due to it, and so he wanted to blame liberals who had encouraged it. The answer is (C).\n\nQ: How did NSC-68 change U.S. strategy?\n(A) It globalized containment. (B) It militarized containment. (C) It called for the development of the hydrogen bomb. (D) All of the above\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. NSC-68 outlined a variety of courses of action, including globalization of containment, militarization of containment, and the development of the hydrogen bomb. The answer is (D).\n\nQ: How do Defensive Realism and Offensive Realism differ in their explanation of state behaviour?\n(A) Defensive realists place greater emphasis on the role of international institutions (B) Defensive realists place less emphasis on geographical factors (C) Offensive realists give more priority to the national interest than Defensive realists. (D) Defensive realists believe states are security maximizers, while Offensive realists believe states to be power maximizers\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. While defensive realism advocates that states are security maximizers, offensive realists think of states as power maximizers. The answer is (D).\n\nQ: The realm of policy decisions concerned primarily with relations between the United States and the rest of the world is known as\n(A) terrorism policy. (B) economic policy. (C) foreign policy. (D) international policy.\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. 
The realm of policy decisions concerned with relations between the US and the rest of the world is known as foreign policy. The answer is (C).\n\nQ: How did the 2008 financial crisis affect America's international reputation?\n(A) It damaged support for the US model of political economy and capitalism (B) It created anger at the United States for exaggerating the crisis (C) It increased support for American global leadership under President Obama (D) It reduced global use of the US dollar\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. The 2008 financial crisis damaged the international reputation of the American model of political economy and capitalism. The answer is (A).", "virology": "The following are multiple choice questions (with answers) about virology.\n\nQ: The median survival time to AIDS and death was established by following:\n(A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters (D) High-risk seronegatives\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The median survival time to AIDS and death was established by following seroconverters. The answer is (C).\n\nQ: Which of the following is a morphological characteristic of the paramyxoviruses.\n(A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate viruses (C) Icosahedral viruses with envelope (D) Very large viruses\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. Paramyxoviruses are fragile viruses often visualised with RNA spewing from the inside. The answer is (A).\n\nQ: The most important goal of a behavioral intervention is:\n(A) Change in behavior (B) Comprehensive coverage (C) Effective use of behavioral theory (D) Sustained behavior change\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The primary goal of a behavioral intervention is to cause sustained behavior change. The answer is (D).\n\nQ: A key factor facilitating the application of nested case-control studies from the MACS was:\n(A) Data collection (B) Establishment of a repository of biologic specimens (C) Participant interest (D) Administration of the questionnaire by staff\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The Multicenter AIDS Cohort Study's use of nested case-control studies was facilitated by the establishment of a repository of biologic specimens. The answer is (B).\n\nQ: Why are parvoviruses a highly impactful parasite?\n(A) Because they have no nucleic acid (B) They require a helper virus (C) Only replicate in dividing cells (D) Can integrate into host chromosomes\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. Parvoviruses are highly impactful because they do not have nucleic acid. The answer is (A).", "world_religions": "The following are multiple choice questions (with answers) about world religions.\n\nQ: How can the Upanishads be characterized?\n(A) Ritual texts (B) Philosophical texts (C) Hymns (D) Origin stories\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The Upanishads are the most recent part of the Vedas (the oldest scriptures in Hinduism) and supplied the basis of later Hindu philosophy. So they are philosophical texts. The answer is (B).\n\nQ: What is the Second Gem in Buddhism?\n(A) The Dharma (B) The Sangha (C) The Buddha (D) The Bodhisattva\nA: Let's think step by step. 
We refer to Wikipedia articles on world religions for help. The Second Gem in Buddhism is The Dharma. The answer is (A).\n\nQ: Which Japanese government promoted a kind of national cult based on the emperor and his associations with kami?\n(A) Honen (B) Tanaka (C) Tokugawa (D) Meiji\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The promotion of a national cult based on the emperor and his associations with Kami happened during the reign of Emperor Meiji (1852-1912). The answer is (D).\n\nQ: In which dynasty was the \"Mandate of Heaven\" developed to legitimatize the new rulers?\n(A) Shang (B) Zhou (C) Han (D) Xia\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The \"Mandate of Heaven\" was developed as an ancient Chinese philosophical concept during the Zhou Dynasty (1046-256 BCE). The answer is (B).\n\nQ: What is the sign of the covenant for Jewish males?\n(A) The rainbow (B) Circumcision (C) A son (D) Bar mitzvah\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. In Judaism, the most distinctive sign of the covenant is circumcision (brit milah). The answer is (B)."} \ No newline at end of file +{"abstract_algebra": "The following are multiple choice questions (with answers) about abstract algebra.\n\nQ: Statement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. A cyclic group is a group that is generated by a single element. Hence a subgroup generated by a single element of a group is cyclic and Statement 1 is True. The symmetric group $S_n$ has $n!$ elements, hence it is not true that $S_{10}$ has 10 elements and Statement 2 is False. The answer is (C).\n\nQ: Find the characteristic of the ring 2Z.\n(A) 0 (B) 3 (C) 12 (D) 30\nA: Let's think step by step. The characteristic of a ring $R$ is $n$ if the statement $ka = 0$ for all $a\\in 2Z$ implies that $k$ is a multiple of $n$. Assume that $ka = 0$ for all $a\\in 2Z$ for some $k$. In particular $2k = 0$. Hence $k=0$ and $n=0$. The answer is (A).\n\nQ: Statement 1| Every function from a finite set onto itself must be one to one. Statement 2 | Every subgroup of an abelian group is abelian.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. Statement 1 is true. Let $S$ be a finite set. If $f:S \\rightarrow S$ is an onto function, then $|S| = |f(S)|$. If $f$ were not one to one, then for the finite domain $S$ the image would have fewer than $|S|$ elements, a contradiction.\nStatement 2 is true. Let $G$ be an abelian group and $H$ be a subgroup of $G$. We need to show that $H$ is abelian. Let $a,b \\in H$. Then $a,b \\in G$, and since $G$ is abelian, $ab=ba$. Therefore $H$ is abelian. The answer is (A).\n\nQ: Statement 1 | If aH is an element of a factor group, then |aH| divides |a|. Statement 2 | If H and K are subgroups of G then HK is a subgroup of G.\n(A) True, True (B) False, False (C) True, False (D) False, True\nA: Let's think step by step. Statement 2 is false. Let $H$ be a subgroup of $S_3$ generated by the cycle $(1,2)$ and $K$ be a subgroup of $S_3$ generated by the cycle $(1,3)$. Both $H$ and $K$ have two elements, the generators and the identity. 
However $HK$ contains cycles (1,2), (1,3) and (2,3,1), but the inverse of (2,3,1) is (2,1,3) and it does not belong to HK, hence HK is not a subgroup. The answer is (B).\n\nQ: Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.\n(A) 0 (B) 1 (C) 2 (D) 3\nA: Let's think step by step. Z_3[x]/(x^2 + c) is a field if and only if x^2 + c does not have roots in Z_3. That is x^2 + c != 0 for every x in Z_3. If c = 0, then x^2 + c = x^2 has root 0. If c = 1 then x^2 + c = x^2 + 1 = 0 + 1 for x = 0, 1 + 1 = 2 for x = 1 and 1 + 1 = 2 for x = 2, hence x^2 + 1 does not have any roots. For c = 2 the polynomial x^2 + 2 has two roots at x = 1 and x = 2. Hence Z_3[x]/(x^2 + c) is a field if and only if c = 1. The answer is (B).", "anatomy": "The following are multiple choice questions (with answers) about anatomy.\n\nQ: Which of the following is the body cavity that contains the pituitary gland?\n(A) Abdominal (B) Cranial (C) Pleural (D) Spinal\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. The pituitary gland is the major endocrine gland attached to the base of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\nQ: Which of these branches of the trigeminal nerve contain somatic motor processes?\n(A) The supraorbital nerve (B) The infraorbital nerve (C) The mental nerve (D) None of the above\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. \nWe know the following: (A) The supraorbital nerve (also known as the frontal nerve) is the largest branch of the ophthalmic nerve and a branch of the ophthalmic division of the trigeminal nerve. (B) The infraorbital nerve is a branch of the maxillary division of the trigeminal nerve. (C) The mental nerve is a branch of the mandibular division of the trigeminal nerve. All these nerves are purely sensory and do not contain any somatic motor processes. Therefore, the answer should be none of the above, which is (D). The answer is (D).\n\nQ: In Angle's Class II Div 2 occlusion there is\n(A) excess overbite of the upper lateral incisors. (B) negative overjet of the upper central incisors. (C) excess overjet of the upper lateral incisors. (D) excess overjet of the upper central incisors.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. This is a question related to anatomy and orthodontics. Excess overjet is associated with Class II occlusions; therefore, we can safely eliminate (B) from the list, as negative overjet is often associated with Class III occlusions. Now, we need to determine the location of the excess overjet, and that would be the upper (maxillary) lateral incisors. Only (C) has the correct information. The answer is (C).\n\nQ: The pleura\n(A) have no sensory innervation. (B) are separated by a 2 mm space. (C) extend into the neck. (D) are composed of respiratory epithelium.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. First, recall that the pleura refers to the thin layer of tissue that covers the lungs and lines the interior wall of the chest cavity. Now, let\u2019s look at each option:\nOption (A): \u201cThe pleura have no sensory innervation.\u201d This information is not correct. 
The pleura do have sensory innervation.\nOption (B): \u201cThe pleura are separated by a 2 mm space.\u201d This information is not correct. There is a very thin \u201cpotential\u201d space between the layers of the pleura; however, it is typically filled with serous pleural fluid. \nOption (C): \u201cThe pleura extend into the neck.\u201d This information is actually true. The cervical pleura, also known as the dome of the pleura, lines the extension of the pleural cavity into the neck.\nOption (D): \u201cThe pleura are composed of respiratory epithelium.\u201d This information is not correct. The pleura are composed of connective tissue (CT).\nBecause (A), (B), and (D) are all incorrect, (C) is the only correct answer. The answer is (C).\n\nQ: What is the embryological origin of the hyoid bone?\n(A) The first pharyngeal arch (B) The first and second pharyngeal arches (C) The second pharyngeal arch (D) The second and third pharyngeal arches\nA: Let's think step by step. We refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step by step. The hyoid bone is a small U-shaped bone located in the anterior neck. In its resting position, it lies between the base of the mandible and the third cervical vertebra. We know that the second and the third pharyngeal arches give rise to the horns of the hyoid bone; therefore, the embryological origin of the hyoid bone is the second and the third pharyngeal arches\u2014this information is covered in the last option (D). Therefore, we conclude that (D) must be the correct answer. The answer is (D).", "astronomy": "The following are multiple choice questions (with answers) about astronomy.\n\nQ: Where do most short-period comets come from and how do we know?\n(A) The Kuiper belt; short period comets tend to be in the plane of the solar system just like the Kuiper belt. (B) The Kuiper belt; short period comets tend to come from random directions indicating a spherical distribution of comets called the Kuiper belt. (C) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt. (D) The Oort cloud; short period comets tend to be in the plane of the solar system just like the Oort cloud.\nA: Let's think step by step. Most short-period comets come from the Kuiper belt, and we know because short period comets tend to be in the plane of the solar system, just like the Kuiper belt is. The answer is (A).\n\nQ: You are pushing a truck along a road. Would it be easier to accelerate this truck on Mars? Why? (Assume there is no friction)\n(A) It would be harder since the truck is heavier on Mars. (B) It would be easier since the truck is lighter on Mars. (C) It would be harder since the truck is lighter on Mars. (D) It would be the same no matter where you are.\nA: Let's think step by step. If we assume that there is no friction, the force needed to accelerate the truck is by Newton\u2019s second law only dependent on the mass of the truck. Hence (A), (B) and (C) are incorrect since it doesn\u2019t matter that it\u2019s on Mars, and (D) is the correct answer. The answer is (D).\n\nQ: Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?\n(A) 10000 times more (B) 100 times more (C) 1000 times more (D) 10 times more\nA: Let's think step by step. 
The amount of light is proportional to the aperture area $A = \\pi D^2/4$ for a lens with diameter $D$, so the relative amount of light between the eye with diameter 5 mm and the telescope with diameter 50 cm is $(50 cm)^2/(5 mm)^2 = 10000$. The answer is (A).\n\nQ: Why isn't there a planet where the asteroid belt is located?\n(A) A planet once formed here but it was broken apart by a catastrophic collision. (B) There was not enough material in this part of the solar nebula to form a planet. (C) There was too much rocky material to form a terrestrial planet but not enough gaseous material to form a jovian planet. (D) Resonance with Jupiter prevented material from collecting together to form a planet.\nA: Let's think step by step. The asteroid belt is a circumstellar disc consisting of a large number of asteroids between Mars and Jupiter's orbits. The asteroids in this belt are affected by the gravitational pull from both other asteroids and nearby planets. Due to the strong gravitational force of Jupiter there are resonances that give rise to low density regions of asteroids known as the Kirkwood gaps. So (B) and (C) are not correct since it\u2019s not a lack of material that prevents a planet from being formed, and (A) is incorrect because these resonances would have prevented a planet from forming in the first place, and (D) is the correct option. The answer is (D).\n\nQ: Why is Mars red?\n(A) Because the surface is covered with heavily oxidized (\"rusted\") minerals. (B) Because the atmosphere scatters more light at bluer wavelengths transmitting mostly red light. (C) Because Mars is covered with ancient lava flows which are red in color. (D) Because flowing water on Mars's surface altered the surface minerals several billion years ago.\nA: Let's think step by step. Option (B) is not correct because if the red color was caused by the scattering off the atmosphere, then the earth with a much thicker atmosphere would also look red. Options (C) and (D) are not specific enough about why the color of the surface would be red, while (A) is correct because it explains that the surface is red due to the rusted materials on the surface and the red color comes from the rust. So the correct option is (A). The answer is (A).", "business_ethics": "The following are multiple choice questions (with answers) about business ethics.\n\nQ: In contrast to _______, _______ aim to reward favourable behaviour by companies. The success of such campaigns have been heightened through the use of ___________, which allow campaigns to facilitate the company in achieving _________ .\n(A) Buycotts, Boycotts, Blockchain technology, Charitable donations (B) Buycotts, Boycotts, Digital technology, Increased Sales (C) Boycotts, Buyalls, Blockchain technology, Charitable donations (D) Boycotts, Buycotts, Digital technology, Increased Sales\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cIn contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies. 
The success of such campaigns have been heightened through the use of *digital technology*, which allow campaigns to facilitate the company in achieving *increased sales*.\u201d The answer is (D).\n\nQ: _______ is the direct attempt to formally or informally manage ethical issues or problems, through specific policies, practices and programmes.\n(A) Corporate social responsibility (B) Business ethics management (C) Sustainability (D) Environmental management\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The direct attempt to manage ethical issues through specific policies, practices, and programs is business ethics management. The answer is (B).\n\nQ: Three contrasting tactics that CSO's can engage in to meet their aims are ________ which typically involves research and communication, ________, which may involve physically attacking a company's operations or ________, often involving some form of _______.\n(A) Non-violent direct action, Violent direct action, Indirect action, Boycott (B) Indirect action, Instrumental action, Non-violent direct action, Information campaign (C) Indirect action, Violent direct action, Non-violent direct-action Boycott (D) Non-violent direct action, Instrumental action, Indirect action, Information campaign\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cThree contrasting tactics that CSO's can engage in to meet their aims are *indirect action*, which typically involves research and communication, *violent direct action*, which may involve physically attacking a company's operations or *non-violent direct action*, often involving some form of *boycott*.\u201d The answer is (C).\n\nQ: To ensure the independence of the non-executive board members, there are a number of steps which can be taken, which include non-executives being drawn from _______ the company, being appointed for a _________ time period as well as being appointed _________.\n(A) Outside, Limited, Independently (B) Inside, Limited, Intermittently (C) Outside, Unlimited, Intermittently (D) Inside, Unlimited, Independently\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cTo ensure the independence of the non-executive board members, there are a number of steps which can be taken, which include non-executives being drawn from *outside* the company, being appointed for a *limited* time period as well as being appointed *independently*.\u201d The answer is (A).\n\nQ: Beyond the business case for engaging in CSR there are a number of moral arguments relating to: negative _______, the _______that corporations possess and the ________ of business and society.\n(A) Externalities, Power, Independence (B) Publicity, Insubstantial resources, Mutual dependence (C) Publicity, Power, Independence (D) Externalities, Power, Mutual dependence\nA: Let's think step by step. We refer to Wikipedia articles on business ethics for help. The sentence that best uses the possible options above is \u201cBeyond the business case for engaging in CSR there are a number of moral arguments relating to: negative *externalities*, the *power* that corporations possess and the *mutual dependence* of business and society.\u201d 
The answer is (D).", "clinical_knowledge": "The following are multiple choice questions (with answers) about clinical knowledge.\n\nQ: Glycolysis is the name given to the pathway involving the conversion of:\n(A) glycogen to glucose-1-phosphate. (B) glycogen or glucose to fructose. (C) glycogen or glucose to pyruvate or lactate. (D) glycogen or glucose to pyruvate or acetyl CoA.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. Glycolysis is the name given to the pathway involving conversion of glycogen or glucose to pyruvate or lactate. The answer is (C).\n\nQ: What is the difference between a male and a female catheter?\n(A) Male and female catheters are different colours. (B) Male catheters are longer than female catheters. (C) Male catheters are bigger than female catheters. (D) Female catheters are longer than male catheters.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. The difference between a male and female catheter is that male catheters tend to be longer than female catheters. The answer is (B).\n\nQ: How many attempts should you make to cannulate a patient before passing the job on to a senior colleague, according to the medical knowledge of 2020?\n(A) 4 (B) 3 (C) 2 (D) 1\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. According to the medical protocol as of 2020, you should make two attempts to cannulate a patient before passing the job on to a more-senior practitioner. The answer is (C).\n\nQ: In the assessment of the hand function which of the following is true?\n(A) Abduction of the thumb is supplied by spinal root T2 (B) Opposition of the thumb by opponens policis is supplied by spinal root T1 (C) Finger adduction is supplied by the median nerve (D) Finger abduction is mediated by the palmar interossei\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. Of all the options, it is only true that the opposition of the thumb by opponens pollicis is supplied by spinal root T1. The answer is (B).\n\nQ: The energy for all forms of muscle contraction is provided by:\n(A) ATP. (B) ADP. (C) phosphocreatine. (D) oxidative phosphorylation.\nA: Let's think step by step. We refer to Wikipedia articles on clinical knowledge for help. The energy for muscular contraction is provided by ATP (adenosine triphosphate), the cell's main energy currency. The answer is (A).", "college_biology": "The following are multiple choice questions (with answers) about college biology.\n\nQ: Which of the following represents an accurate statement concerning arthropods?\n(A) They possess an exoskeleton composed primarily of peptidoglycan. (B) They possess an open circulatory system with a dorsal heart. (C) They are members of a biologically unsuccessful phylum incapable of exploiting diverse habitats and nutrition sources. (D) They lack paired, jointed appendages.\nA: Let's think step by step. Peptidoglycan is known to comprise the cell wall of most bacteria, rather than the exoskeleton of arthropods, which is made of chitin, which rules out (A). The answer (C) is false because arthropods are a highly successful phylum. Likewise, arthropods have paired, jointed appendages, which rules out (D). The only remaining option is (B), as arthropods have an open circulatory system with a dorsal tubular heart.
The answer is (B).\n\nQ: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?\n(A) 1/400 (B) 19/400 (C) 20/400 (D) 38/400\nA: Let's think step by step. According to the Hardy Weinberg Law, $p^2 + 2 p q + q^2 = 1$, and $p + q = 1$ where $p$ is the frequency of the dominant allele, $q$ is the frequency of the recessive allele, and $p^2$, $q^2$, and $2pq$ are the frequencies of dominant homozygous, recessive homozygous, and heterozygous individuals, respectively. The frequency of the recessive allele (q) is $\\sqrt{\\frac{1}{400}} = 0.05$. We have $p = 1 - q = 0.95$. The frequency of heterozygous individuals is $2pq = 2 \\cdot 0.05 \\cdot 0.95 = 0.095$. The number of heterozygous individuals is equal to the frequency of heterozygous individuals times the size of the population, or $0.095 * 400 = 38$. So we end up with 38/400. The answer is (D).\n\nQ: According to the pressure-flow model of movement of phloem contents, photosynthate movement from source to sink is driven by\n(A) an ATP-dependent pressure-flow pump (B) a water-pressure potential gradient (C) transpiration (D) apoplastic diffusion\nA: Let's think step by step. It is a gradient in water pressure that induces the movement of phloem content, which refers to answer (B). The mechanism of movement does not rely on metabolism, which rules out (A). Transpiration refers to the exhalation of water vapor through plant stomata, and is also not related, which rules out (C). While the apoplastic pathway is one of two main pathways for water transport in plants, it is not central to the pressure-flow model, which rules out (D). The answer is (B).\n\nQ: Which of the following contain DNA sequences required for the segregation of chromosomes in mitosis and meiosis?\n(A) Telomeres (B) Centromeres (C) Nucleosomes (D) Spliceosomes\nA: Let's think step by step. Telomeres are protective caps on the ends of chromosomes and are not responsible for segregation, which rules out (A). Nucleosomes are the repeating subunit that comprises chromatin packed in a cell nucleus, and do not specifically refer to DNA sequences necessary for segregating chromosomes in cell division, which rules out (C). A spliceosome is a large ribonucleoprotein that removes introns from transcribed pre-mRNA rather than governing chromosome segregation, which rules out (D). Centromeres are directly responsible for segregating chromosomes in cell division. The answer is (B).\n\nQ: The presence of homologous structures in two different organisms, such as the humerus in the front limb of a human and a bird, indicates that\n(A) the human and bird are polyphyletic species (B) a human's and bird's evolution is convergent (C) the human and bird belong to a clade (D) the human and bird developed by analogy\nA: Let's think step by step. Polyphyletic species are organisms that are grouped due to having similar characteristics but which do not have a common ancestor. This is not the case for humans and birds, which rules out (A). Convergent evolution refers to the independent development of similar features in different species at different periods, which is also not the case for humans and birds, which rules out (B). Analogy refers to the superficial resemblance of structures that have different origins, which is not the case for the human and bird forearms, which rules out (D).
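The Hardy-Weinberg arithmetic above is easy to verify numerically; a minimal sketch using only the numbers stated in the question (population of 400, disease frequency 1/400):

import math

q = math.sqrt(1 / 400)       # recessive allele frequency, 0.05
p = 1 - q                    # dominant allele frequency, 0.95
carriers = 2 * p * q * 400   # expected heterozygous carriers in 400 people
print(carriers)              # 38.0, i.e. 38/400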
Humans and birds do belong to the same clade - a group of organisms that includes a common ancestor and all of its descendants. The answer is (C).", "college_chemistry": "The following are multiple choice questions (with answers) about college chemistry.\n\nQ: 3 Cl\u2212(aq) + 4 CrO_4^2\u2212(aq) + 23 H+(aq) \u2192 3 HClO2(aq) + 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl\u2212(aq) behaves as\n(A) an acid (B) a base (C) a catalyst (D) a reducing agent\nA: Let's think step by step. A molecule that behaves as a base accepts an H+ ion (or proton) from another molecule, whereas a molecule that behaves as an acid donates an H+ ion (or proton) to another molecule. Neither of these is the case for Cl in this reaction, which rules out (A) and (B). A catalyst is a substance that only accelerates a reaction without itself undergoing chemical change, which is not the case here. This rules out (C). Instead, the $Cl^{-}$ ions are oxidized, donating electrons in the reaction to form 3 HClO2. This is the behavior of a reducing agent, or (D). The answer is (D).\n\nQ: Which of the following statements about the lanthanide elements is NOT true?\n(A) The most common oxidation state for the lanthanide elements is +3. (B) Lanthanide complexes often have high coordination numbers (> 6). (C) All of the lanthanide elements react with aqueous acid to liberate hydrogen. (D) The atomic radii of the lanthanide elements increase across the period from La to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide elements in fact decrease across the period from La to Lu. Options (A), (B), and (C) are all true. This means that only (D) is NOT true. The answer is (D).\n\nQ: Which of the following lists the hydrides of group-14 elements in order of thermal stability, from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4 < SnH4 < CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4 < GeH4 < SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14 hydrides decreases as we move from the top of group 14 to the bottom. The order of elements in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in order of increasing thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4, or answer (A). The answer is (A).\n\nQ: Predict the number of lines in the EPR spectrum of a solution of 13C-labelled methyl radical (13CH3\u2022), assuming the lines do not overlap.\n(A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step by step. The electron paramagnetic resonance spectrum will be split by two forms of interactions. The first is the hyperfine interaction with the 13C (nuclear spin $I = \\frac{1}{2}$), which will split the spectrum into 2 lines. This will be further split into 4 lines by the interaction with three equivalent 1H nuclei. The total number of lines is therefore $2 \\cdot 4 = 8$. The answer is (E).", "college_computer_science": "The following are multiple choice questions (with answers) about college computer science.\n\nQ: Which of the following regular expressions is equivalent to (describes the same set of strings as) (a* + b)*(c + d)?\n(A) a*(c + d)+ b(c + d)\n(B) a*(c + d)* + b(c + d)*\n(C) a*(c + d)+ b*(c + d)\n(D) (a + b)*c +(a + b)*d\nA: Let's think step by step. We know that:\n1. (X* + Y)* = (X + Y)*\n2. X(Y + Z) = XY + XZ\nUsing equation 1 we can rewrite (a* + b)*(c + d) as:\n3.
(a + b)*(c + d)\nUsing equation 2 we can rewrite equation 3 as:\n(a + b)*c + (a + b)*d. The answer is (D).\n\nQ: The Singleton design pattern is used to guarantee that only a single instance of a class may be instantiated. Which of the following is (are) true of this design pattern?\nI. The Singleton class has a static factory method to provide its instance.\nII. The Singleton class can be a subclass of another class.\nIII. The Singleton class has a private constructor.\n(A) I only\n(B) II only\n(C) III only\n(D) I, II, and III\nA: Let's think step by step. Statement I is a correct statement about a Singleton, because a Singleton provides its single instance via a static factory method. Statement II is also correct, because there is no inherent restriction regarding the inheritance of a Singleton. Statement III is also correct, because a Singleton must be instantiated only once, so its constructor is made private to prevent any construction except via its static factory method.\nGiven these facts, statements I, II, and III are all correct. The answer is (D).\n\nQ: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, . . . , R7 and supports the following operations:\nADD Rs1, Rs2, Rd (Add Rs1 to Rs2 and put the sum in Rd)\nMUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the product in Rd)\nAn operation normally takes one cycle; however, an operation takes two cycles if it produces a result required by the immediately following operation in an operation sequence.\nConsider the expression AB + ABC + BC, where variables A, B, C are located in registers R0, R1, R2. If the contents of these three registers must not be modified, what is the minimum number of clock cycles required for an operation sequence that computes the value of AB + ABC + BC?\n(A) 5 (B) 6 (C) 7 (D) 8\nA: Let's think step by step. First, we are given that A is in R0, B is in R1, and C is in R2.\nNext, we can see that we must compute three multiplies (AB, BC, and ABC) and two adds (AB + ABC, (AB + ABC) + BC) to compute our final answer, resulting in a minimum of five clock cycles.\nNext, we can see that there is no way to avoid at least one pipeline stall when computing our final answer, because to compute our final sum we must wait at least one cycle for the results from the previous stage to be ready. Thus, our minimum number of cycles must be 6.\nWe can verify that we can create a solution that requires only six cycles as follows:\ncompute AB: MUL R0, R1, R3\ncompute BC: MUL R1, R2, R4\ncompute ABC: MUL R3, R4, R5\ncompute AB + BC: ADD R3, R4, R6\nSTALL\ncompute AB + ABC + BC: ADD R5, R6, R7\nSo there are 6 cycles. The answer is (B).\n\nQ: A compiler generates code for the following assignment statement.\nG := (A + B) * C - (D + E) * F\nThe target machine has a single accumulator and a single-address instruction set consisting of instructions load, store, add, subtract, and multiply. For the arithmetic operations, the left operand is taken from the accumulator and the result appears in the accumulator. The smallest possible number of instructions in the resulting code is\n(A) 5 (B) 6 (C) 7 (D) 9\nA: Let's think step by step. We can compute the final answer with the following sequence of operations:\n1. LOAD D (accumulator = D)\n2. ADD E (accumulator = D+E)\n3. MUL F (accumulator = (D+E)*F)\n4. STORE X (X = (D+E)*F)\n5. LOAD A (accumulator = A)\n6. ADD B (accumulator = A+B)\n7. MUL C (accumulator = (A+B)*C)\n8. SUB X (accumulator = (A+B)*C - (D+E)*F)\n9.
STORE G (G = (A+B)*C - (D+E)*F)\nThis sequence takes 9 instructions. The answer is (D).\n\nQ: Consider a computer design in which multiple processors, each with a private cache memory, share global memory using a single bus. This bus is the critical system resource. Each processor can execute one instruction every 500 nanoseconds as long as memory references are satisfied by its local cache. When a cache miss occurs, the processor is delayed for an additional 2,000 nanoseconds. During half of this additional delay, the bus is dedicated to serving the cache miss. During the other half, the processor cannot continue, but the bus is free to service requests from other processors. On average, each instruction requires 2 memory references. On average, cache misses occur on 1 percent of references. What proportion of the capacity of the bus would a single processor consume, ignoring delays due to competition from other processors?\n(A) 1/50 (B) 1/27 (C) 1/25 (D) 2/27\nA: Let's think step by step. We know that each instruction requires two memory references per instruction, and that there is an average cache miss rate of one percent.\nThus a given processor has:\n(1 cache miss / 100 references) * (2 references / instruction) =\n(2 cache misses / 100 instructions), so:\nmisses_per_instruction = 1 cache miss / 50 instructions.\nNext, we know that each instruction requires 500 nanoseconds when there is no cache miss, and 500 + 2000 = 2500 nanoseconds when there is a cache miss. Thus:\n50 instructions take (49 * 500) + (1 * 2500) = 27000 nanoseconds, so:\ninstructions_per_ns = 50 instructions / 27000 nanoseconds.\nNow, we know that each cache miss locks the bus for half of the 2000 nanosecond cache miss delay, or 1000 nanoseconds, so:\nlock_ns_per_miss = 1000 nanoseconds / cache miss.\nThus we can see that on average a single processor will lock the bus for:\nlock_ns_per_miss * misses_per_instruction * instructions_per_ns =\n(1000 nanoseconds / cache miss) * (1 cache miss / 50 instructions) * (50 instructions / 27000 nanoseconds) = 1000 * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer is (B).", "college_mathematics": "The following are multiple choice questions (with answers) about college mathematics.\n\nQ: Let V be the set of all real polynomials p(x). Let transformations T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret (ST)(p(x)) as S(T(p(x))). Which of the following is true?\n(A) ST = 0 (B) ST = T (C) ST = TS (D) ST - TS is the identity map of V onto itself.\nA: Let's think step by step. For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))\u2019 = p(x) + xp\u2019(x)\\]\nand\n\\[TS(p) = xp\u2019(x).\\]\nHence \\[ST(p) - TS(p) = p(x) + xp\u2019(x) - xp\u2019(x) = p(x),\\]\nso ST - TS is the identity map. The answer is (D).\n\nQ: Suppose that f(1 + x) = f(x) for all real x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11 (B) 0 (C) 11 (D) 33/2\nA: Let's think step by step. The only polynomials such that $f(1 + x) = f(x)$ are constant polynomials. Hence $f(5) = 11 = f(15/2)$. The answer is (C).\n\nQ: Let A be a real 2x2 matrix. Which of the following statements must be true?\nI. All of the entries of A^2 are nonnegative.\nII. The determinant of A^2 is nonnegative.\nIII. If A has two distinct eigenvalues, then A^2 has two distinct eigenvalues.\n(A) I only (B) II only (C) III only (D) II and III only\nA: Let's think step by step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence II holds.\nIII is false: as a counterexample take a diagonal matrix with -1 and 1 on the diagonal.
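The bus-utilization reasoning above chains three ratios; a minimal numerical check with the values stated in the question (500 ns per instruction, 2000 ns miss penalty of which half locks the bus, 2 references per instruction, 1% miss rate):

misses_per_instr = 2 * 0.01                         # 1 cache miss per 50 instructions
avg_ns_per_instr = 500 + misses_per_instr * 2000    # 540 ns, i.e. 27000 ns per 50 instructions
bus_ns_per_instr = misses_per_instr * 1000          # bus locked 1000 ns per miss
print(bus_ns_per_instr / avg_ns_per_instr)          # 0.037037... = 1/27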
Then $A^2$ is the identity matrix, whose eigenvalues are not distinct. Statement I is also false: for the matrix $A$ with rows (0, 1) and (-1, 0), $A^2 = -I$ has negative entries. The answer is (B).\n\nQ: Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest negative number in the set B = {m + n : (m, n) \\in A}?\n(A) -5 (B) -4 (C) -3 (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m and one of the solutions is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need to look for smaller $m$ in order to make $m + n$ negative. The next solution is $m = -14$ and $n = 10$. For smaller $m$ we have $m + n$ smaller than $-4$. The answer is (B).\n\nQ: A tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt per liter of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is continually mixed with the salt solution in the tank, and the mixture flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous, how many grams of salt are in the tank after 100 minutes have elapsed?\n(A) 2 (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step by step. For all $t \\in \\mathbb{R}$, let $s(t)$ denote the number of grams of salt in the tank at the $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably. We also use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. The solution sprayed into the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always 100 liters of liquid in the tank, containing $s$ grams of salt. So the density of salt in the tank is $s / 100$ grams per liter. The flow of water out of the tank therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for all $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$, and so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\\right]$. For all $t \\in \\mathbb{R}$,\n$$\n\\frac{d}{d t}[\\ln (s-2)]=\\frac{s^{\\prime}}{s-2}=\\frac{-1}{25}=\\frac{d}{d t}\\left[-\\frac{t}{25}\\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that, for all $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then, for all $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K e^{-t / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100 / 25}=2+1 \\cdot e^{-4}=2+e^{-4}$. The answer is (D).", "college_medicine": "The following are multiple choice questions (with answers) about college medicine.\n\nQ: An expected side effect of creatine supplementation is:\n(A) muscle weakness. (B) gain in body mass. (C) muscle cramps. (D) loss of electrolytes.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Creatine is a dietary supplement whose use results in a gain in body mass. The answer is (B).\n\nQ: Which of the following is not a true statement?\n(A) Muscle glycogen is broken down enzymatically to glucose-1-phosphate (B) Elite endurance runners have a high proportion of Type I fibres in their leg muscles (C) Liver glycogen is important in the maintenance of the blood glucose concentration (D) Insulin promotes glucose uptake by all tissues in the body\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help.
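The closed form s(t) = 2 + e^{-t/25} derived in the mixing-tank exemplar above can be cross-checked by integrating the ODE numerically; a minimal Euler-stepping sketch (the step size is an arbitrary choice of mine):

import math

s, dt = 3.0, 0.001                # s(0) = 3 grams, small Euler step
for _ in range(int(100 / dt)):    # integrate s'(t) = (2 - s) / 25 out to t = 100
    s += (2 - s) / 25 * dt
print(s, 2 + math.exp(-4))        # both approximately 2.0183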
Let\u2019s solve this step by step and go over each choice: \n(A) \u201cMuscle glycogen is broken down enzymatically to glucose-1-phosphate\u201d: This is a correct statement.\n(B) \u201cElite endurance runners have a high proportion of Type I fibres in their leg muscles\u201d: This is a correct statement.\n(C) \u201cLiver glycogen is important in the maintenance of the blood glucose concentration\u201d: This is a correct statement. \n(D) \u201cInsulin promotes glucose uptake by all tissues in the body\u201d: This is not a correct statement, because insulin promotes glucose uptake by the liver, adipose tissue, and muscle, but not all tissues. For instance, the tissues in the brain and red blood cells are not affected by insulin. The answer is (D).\n\nQ: A high school science teacher fills a 1 liter bottle with pure nitrogen and seals the lid. The pressure is 1.70 atm, and the room temperature is 25\u00b0C. Which two variables will both increase the pressure of the system, if all other variables are held constant?\n(A) Increasing temperature, increasing moles of gas (B) Increasing temperature, increasing volume (C) Decreasing volume, decreasing temperature (D) Decreasing moles of gas, increasing volume\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The relevant equation for this is the ideal gas law: PV=nRT. To increase the pressure of the system (P), then either n (number of moles of the gas) or T (temperature) have to increase. The answer is (A).\n\nQ: In a genetic test of a newborn, a rare genetic disorder is found that has X-linked recessive transmission. Which of the following statements is likely true regarding the pedigree of this disorder?\n(A) All descendants on the maternal side will have the disorder. (B) Females will be approximately twice as affected as males in this family. (C) All daughters of an affected male will be affected. (D) There will be equal distribution of males and females affected.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s solve this step by step. Let's recall first that females have two X chromosomes, while males have one X and one Y chromosome. This is an important fact we need to know before answering this question. \nBecause a male can only pass his only one X chromosome to a daughter, if he is affected by this rare genetic disorder, then we know for sure that he will pass this rare genetic disorder to all his future-born daughters. Therefore, \u201c(C): All daughters of an affected male will be affected\u201d is a correct statement. The answer is (C).\n\nQ: Glucose is transported into the muscle cell:\n(A) via protein transporters called GLUT4. (B) only in the presence of insulin. (C) via hexokinase. (D) via monocarbylic acid transporters.\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Glucose (also known as the blood sugar) is the main sugar found in the human body. It is transported into the muscle cell via diffusion through protein transporters called GLUT4. The answer is (A).", "college_physics": "The following are multiple choice questions (with answers) about college physics.\n\nQ: A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. 
In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4. The answer is (A).\n\nQ: The muon decays with a characteristic lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from decaying into an electron and just a single neutrino by the law of conservation of\n(A) charge (B) mass (C) energy and momentum (D) lepton number\nA: Let's think step by step. Lepton number must be conserved, meaning the total number of leptons minus the number of antileptons. If a muon decays into an electron and a single neutrino, the total lepton number would go from one to two, violating lepton number conservation. The answer is (D).\n\nQ: One end of a Nichrome wire of length 2L and cross-sectional area A is attached to an end of another Nichrome wire of length L and cross-sectional area 2A. If the free end of the longer wire is at an electric potential of 8.0 volts, and the free end of the shorter wire is at an electric potential of 1.0 volt, the potential at the junction of the two wires is most nearly equal to\n(A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V\nA: Let's think step by step. This is a simple voltage divider problem, where the longer wire has a resistance four times that of the shorter one. So the voltage divider ratio is 1 / 5, meaning that the potential in the middle is 1.0 V + (8.0 V - 1.0 V) * 1/5 = 2.4 V. The answer is (A).\n\nQ: For which of the following thermodynamic processes is the increase in the internal energy of an ideal gas equal to the heat added to the gas?\n(A) Constant temperature (B) Constant volume (C) Constant pressure (D) Adiabatic\nA: Let's think step by step. Heat added to the gas can go into the gas's internal energy or into work done against an external force. However, if the volume of the gas container is constant, no work will be done (since work is pressure times change in volume). So, at constant volume, all of the heat goes into the internal energy. The answer is (B).", "computer_security": "The following are multiple choice questions (with answers) about computer security.\n\nQ: SHA-1 has a message digest of\n(A) 160 bits (B) 512 bits (C) 628 bits (D) 820 bits\nA: Let's think step by step. Since SHA-1 is a hash function which takes an input and produces a 160-bit (20-byte) hash value, its message digest is 160 bits.
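The Nichrome junction potential above follows from resistance being proportional to length divided by area; a minimal sketch with the common factor rho*L/A normalized to 1 (variable names are mine, values from the question):

r_long = 2.0 / 1.0     # length 2L, area A
r_short = 1.0 / 2.0    # length L, area 2A -> one quarter of r_long
v = 1.0 + (8.0 - 1.0) * r_short / (r_long + r_short)
print(v)               # 2.4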
The answer is (A).\n\nQ: _____________ can modify data on your system \u2013 so that your system doesn\u2019t run correctly or you can no longer access specific data, or it may even ask for ransom in order to give your access.\n(A) IM \u2013 Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D) Ransom Trojan\nA: Let's think step by step. Malware that modifies data and demands a ransom in order to restore access is a ransom trojan. The answer is (D).\n\nQ: What is ethical hacking?\n(A) \"Hacking\" ethics so they justify unintended selfish behavior (B) Hacking systems (e.g., during penetration testing) to expose vulnerabilities so they can be fixed, rather than exploited (C) Hacking into systems run by those whose ethics you disagree with (D) A slang term for rapid software development, e.g., as part of hackathons\nA: Let's think step by step. Ethical hacking is a process of detecting vulnerabilities in an application, system, or organization's infrastructure that an attacker can use to exploit an individual or organization. They use this process to prevent cyberattacks and security breaches by lawfully hacking into the systems and looking for weak points. The answer is (B).\n\nQ: The ____________ is anything which your search engine cannot search.\n(A) Haunted web (B) World Wide Web (C) Surface web (D) Deep Web\nA: Let's think step by step. The search engine searches on the Surface Web, which is the portion of the World Wide Web which is visible, so (B) and (C) are wrong. The Haunted Web doesn\u2019t correspond to an internet concept. The Deep Web is the part of the World Wide Web which is not indexed. The answer is (D).\n\nQ: Exploitation of the Heartbleed bug permits\n(A) overwriting cryptographic keys in memory (B) a kind of code injection (C) a read outside bounds of a buffer (D) a format string attack\nA: Let's think step by step. The Heartbleed Bug is a serious vulnerability in the popular OpenSSL cryptographic software library. Heartbleed resulted from improper input validation (due to a missing bounds check) in the implementation of the TLS heartbeat extension. The vulnerability was classified as a buffer over-read, a situation where more data can be read than should be allowed. The answer is (C).", "conceptual_physics": "The following are multiple choice questions (with answers) about conceptual physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted to a different frequency (B) deflection (C) interference (D) polarization\nA: Let's think step by step. In a soap bubble film, the light bounces between the two soap-air interfaces many times, interfering with itself constructively or destructively depending on the width of the film. This results in different colors being visible. The answer is (C).\n\nQ: Compared with the mass of a uranium atom undergoing fission, the combined masses of the products after fission are\n(A) less (B) more (C) the same (D) zero\nA: Let's think step by step. Fission releases energy, which comes from the rest mass of its initial nucleus. Thus the mass of the products is less than the mass of the reactant uranium nucleus. The answer is (A).\n\nQ: Things that are equivalent according to the equivalence principle are\n(A) space and time. (B) a traveling twin and a stay-at-home twin. (C) gravity and acceleration. (D) mass and energy.\nA: Let's think step by step. Einstein\u2019s famous equivalence principle states that gravity and acceleration are equivalent.
The answer is (C).\n\nQ: Which of these three elements has the most mass per nucleon?\n(A) Hydrogen (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step. Due to nuclear binding energy, the mass of an atomic nucleus is less than the sum of individual masses of the free constituent protons and neutrons; this is known as the mass defect. Hydrogen has no mass defect because it has only a single nucleon, so it will have the most mass per nucleon. The answer is (A).\n\nQ: A model airplane flies slower when flying into the wind and faster with wind at its back. When launched at right angles to the wind, in a crosswind, its groundspeed compared with flying in still air is\n(A) the same (B) greater (C) less (D) either greater or less depending on wind speed\nA: Let's think step by step. The plane\u2019s speed in the direction of the wind is greater than it would be in the absence of wind, and its direction orthogonal to the wind is the same as it would be in the absence of the wind. The total speed, which is these two components added in quadrature, is thus greater than the speed in still air. The answer is (B).", "econometrics": "The following are multiple choice questions (with answers) about econometrics.\n\nQ: Suppose now that a researcher wishes to use information criteria to determine the optimal lag length for a VAR. 500 observations are available for the bi-variate VAR, and the values of the determinant of the variance-covariance matrix of residuals are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively. What is the optimal model order according to Akaike's information criterion?\n(A) 1 lag (B) 2 lags (C) 3 lags (D) 4 lags\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s solve this problem step by step. First of all, let\u2019s recall that for a given set of data, Akaike's information criterion (AIC) allows us to measure how well a statistical model fits the data; it is an estimator of prediction error. Here in this problem we will need to use the formula ln(det(sigma_hat)) + (2 * k / T) to determine the values of Akaike\u2019s criterion, where ln denotes the natural log function, det the determinant function, k the number of parameters in total (across both equations), and T the number of observations (which, in this case, is equal to 500). For 1 lag, the number of parameters in total is equal to 6; for 2 lags, it is 10; for 3 lags, it is 14; and for 4 lags, it is 18. Now, let\u2019s calculate the values of the criterion for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500) = ln(0.0336) + (12 / 500) = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) = ln(0.0169) + (20 / 500) = -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084) + (28 / 500) = -4.724\n(D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36 / 500) = -5.011\nBecause the optimal model order according to AIC minimizes the information criterion, the answer should be the one with the lowest value. In this case, (D) has the lowest value. The answer is (D).\n\nQ: Consider the following AR(1) model with the disturbances having zero mean and unit variance\nyt = 0.2 + 0.4 yt-1 + ut\nThe (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4 (C) 0.5 (D) 0.33\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s solve this problem step by step.
If we have an AR(1) model with the disturbances having zero mean and unit variance, then the unconditional mean of y is equal to the following:\nunconditional mean of y = (the intercept term) / (1 - autoregressive coefficient)\nWe know that the intercept term is 0.2 and the autoregressive coefficient is 0.4; thus, we have:\nunconditional mean of y = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which is approximately 0.33. That means that the answer should be (D) 0.33. The answer is (D).\n\nQ: What would then be the consequences for the OLS estimator if heteroscedasticity is present in a regression model but ignored?\n(A) It will be biased (B) It will be inconsistent (C) It will be inefficient (D) All of (a), (b) and (c) will be true.\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Heteroscedasticity refers to the condition where the variance of the error terms is not constant across multiple observations. If heteroscedasticity is present in a regression model but ignored, then the OLS coefficient estimates remain unbiased and consistent, but they are inefficient. Because (A) and (B) are incorrect choices and (C) is a correct choice, (D) cannot be the right answer. Ultimately, (C) is the only true choice. The answer is (C).\n\nQ: Suppose that a test statistic has associated with it a p-value of 0.08. Which one of the following statements is true?\n(i) If the size of the test were exactly 8%, we would be indifferent between rejecting and not rejecting the null hypothesis\n(ii) The null would be rejected if a 10% size of test were used\n(iii) The null would not be rejected if a 1% size of test were used\n(iv) The null would be rejected if a 5% size of test were used.\n(A) (ii) and (iv) only (B) (i) and (iii) only (C) (i), (ii), and (iii) only (D) (i), (ii), (iii), and (iv).\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019s reason about each of the options.\n(i) is a true statement.\n(ii) is a true statement.\n(iii) is a true statement.\n(iv) is not a true statement. Thus, (i), (ii), and (iii) are true. The answer is (C).\n\nQ: For a stationary autoregressive process, shocks will\n(A) Eventually die away (B) Persist indefinitely (C) Grow exponentially (D) Never occur\nA: Let's think step by step. We refer to Wikipedia articles on econometrics for help. This is a question about stationary processes. For a stationary autoregressive process, shocks will eventually die away. The answer is (A).", "electrical_engineering": "The following are multiple choice questions (with answers) about electrical engineering.\n\nQ: A point pole has a strength of 4\u03c0 * 10^-4 weber. The force in newtons on a point pole of 4\u03c0 * 1.5 * 10^-4 weber placed at a distance of 10 cm from it will be\n(A) 15 N. (B) 20 N. (C) 7.5 N. (D) 3.75 N.\nA: Let's think step by step. The force between two point poles is given by m_1m_2/(mu_0 4 \\pi r^2), in analogy to Coulomb\u2019s law. Plugging in the values given in the question, we calculate that the force is approximately 15 N. The answer is (A).\n\nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30 mm wide. The control torque is 240*10^-6 N-m on full scale. If the magnetic flux density is 1 Wb/m2, the range of the meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's think step by step.
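The AIC table in the econometrics exemplar above can be reproduced directly from the stated formula ln(det(sigma_hat)) + 2k/T; a minimal sketch (the determinant values and parameter counts are exactly those given in the question):

import math

dets = {1: 0.0336, 2: 0.0169, 3: 0.0084, 4: 0.0062}   # lags -> det of residual covariance
params = {1: 6, 2: 10, 3: 14, 4: 18}                  # total parameters across both equations
for lags in dets:
    print(lags, round(math.log(dets[lags]) + 2 * params[lags] / 500, 3))
# prints -3.369, -4.04, -4.724, -5.011; the criterion is minimized at 4 lags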
The torque on a coil in a uniform magnetic field is given by BANI, where B is the magnetic flux density, A is the area of the coil, N is the number of turns, and I is the current. So we have that I = (Torque)/(BAN), or 240e-6/(1200e-6 * 100 * 1) = 2e-3. The answer is (B).\n\nQ: In an SR latch built from NOR gates, which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1 (C) S=1, R=0 (D) S=1, R=1\nA: Let's think step by step. An SR latch is a set-reset latch; in the case where S=1 and R=1, the circuit has no stable state; instead a race condition will be produced within the circuit, so the device will be in an undefined state. So S=1, R=1 is an illegal input. The answer is (D).\n\nQ: Two long parallel conductors carry 100 A. If the conductors are separated by 20 mm, the force per meter of length of each conductor will be\n(A) 100 N. (B) 0.1 N. (C) 1 N. (D) 0.01 N.\nA: Let's think step by step. The magnetic force-per-length between two current-carrying conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r), where $r$ is the separation distance and I_1 and I_2 are the currents. Plugging in 100 A for I_1 and I_2, and 20 mm for r, gives 0.1 N. The answer is (B).\n\nQ: In a 2-pole lap winding dc machine, the resistance of one conductor is 2\u03a9 and the total number of conductors is 100. Find the total resistance\n(A) 200\u03a9 (B) 100\u03a9 (C) 50\u03a9 (D) 10\u03a9\nA: Let's think step by step. In lap winding, effectively two resistors are connected in parallel, so the actual resistance of each pair is 1 Ohm. Since we have 50 pairs, we get a total resistance of 50 Ohms. The answer is (C).", "elementary_mathematics": "The following are multiple choice questions (with answers) about elementary mathematics.\n\nQ: Olivia used the rule \"Add 11\" to create the number pattern shown below. 10, 21, 32, 43, 54. Which statement about the number pattern is true?\n(A) The 10th number in the pattern will be an even number.\n(B) The number pattern will never have two even numbers next to each other.\n(C) The next two numbers in the pattern will be an even number then an odd number.\n(D) If the number pattern started with an odd number then the pattern would have only odd numbers in it.\nA: Let's think step by step. Choice A is incorrect because every even-numbered term in the pattern is odd, and 10 is an even number. Choice B is correct, because adding an odd number (in this case 11) to an odd number produces an even number, and adding an odd number to an even number produces an odd number. Thus the terms in the pattern will alternate between odd and even, so there will never be two even numbers next to each other. Choice C is incorrect because the last term in the example is even (54), and we know that the terms will alternate between even and odd. Choice D is incorrect because the terms in the pattern will alternate between odd and even, regardless of the value of the first term. The answer is (B).\n\nQ: The population of the city where Michelle was born is 145,826. What is the value of the 5 in the number 145,826?\n(A) 5 thousands\n(B) 5 hundreds\n(C) 5 tens\n(D) 5 ones\nA: Let's think step by step. Choice A is correct, because there are three digits following the 5, so the 5 is in the thousands place. Thus the other choices are incorrect. The answer is (A).\n\nQ: A store sells 107 different colors of paint. They have 25 cans of each color in storage. The number of cans of paint the store has in storage can be found using the expression below. 107 \u00d7 25.
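The moving-coil meter range above is a direct torque balance, I = torque / (B * A * N); a minimal sketch with the coil dimensions stated in the question:

torque = 240e-6          # control torque at full scale, N*m
B = 1.0                  # flux density, Wb/m^2
area = 0.040 * 0.030     # 40 mm x 30 mm coil area, in m^2
turns = 100
print(torque / (B * area * turns))   # 0.002 A, i.e. 2 mA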
How many cans of paint does the store have in storage?\n(A) 749\n(B) 2,675\n(C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107 x 25 = (100 x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total of 30 players will play basketball at a park. There will be exactly 5 players on each team. Which statement correctly explains how to find the number of teams needed?\n(A) Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n(C) Multiply 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\nA: Let's think step by step. We want to find the number of teams. We know that there are 5 players/team, and 30 players. Thus to get the number of teams we divide players by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B).\n\nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n(B) (5 x 5) + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think step by step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5 x 4). The answer is (B).", "formal_logic": "The following are multiple choice questions (with answers) about formal logic.\n\nQ: Which of the given formulas of PL is the best symbolization of the following sentence?\nTurtles live long lives and are happy creatures, unless they are injured.\n(A) (L \u2022 H) \u2261 I (B) (L \u2022 H) \u2228 I (C) L \u2022 (H \u2228 I) (D) L \u2022 (H \u2283 R).\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cL\u201d denote \u201cliving long\u201d, H \u201cbeing happy\u201d, and \u201cI\u201d \u201cbeing injured\u201d. Now, consider each choice:\n(A) means (living long AND being happy) is equivalent to (being injured). \n(B) means (living long AND being happy) OR (being injured). \n(C) means (living long) AND (being happy OR being injured). \n(D) means (living long) AND (being happy implies being R), but what R denotes is not clear.\nObviously, (B) is the best symbolization of the original sentence. The answer is (B).\n\nQ: Select the best translation into predicate logic.George borrows Hector's lawnmower. (g: George; h: Hector; l: Hector's lawnmower; Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh (D) Bghl\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. We are told that \u201cBxyx\u201d means \u201cx borrows y from z\u201d. We can rewrite \u201cGeorge borrows Hector's lawnmower\u201d as \u201cGeorge borrows a lawnmower from Hector\u201d, which can then be translated into predicate logic as \u201cBglh\u201d. The answer \u201cBglh\u201d appears in (C); therefore, (C) must be the correct answer. The answer is (C).\n\nQ: \nSelect the best English interpretation of the given arguments in predicate logic.\nDm\n(\u2200x)(Wx \u2283 ~Dx). \n(\u2200x)Wx \u2228 Ag\t/ (\u2203x)Ax\n(A) Marina is a dancer. Some weaklings are not dancers. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball. (B) Marina is a dancer. No weakling is a dancer. Everything is either a weakling or plays volleyball. So something plays volleyball. (C) Marina is a dancer. Some weaklings are not dancers. Everything is either a weakling or plays volleyball. So something plays volleyball. (D) Marina is a dancer. No weakling is a dancer. Either everything is a weakling or Georgia plays volleyball. So something plays volleyball.\nA: Let's think step by step. 
We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cD\u201d denote \u201cbeing a dancer\u201d, \u201cm\u201d denote \u201cMarina\u201d, \u201cg\u201d denote \u201cGeorgia\u201d, \u201cW\u201d denote \u201cweakling\u201d, \u201cA\u201d denote \u201cplaying volleyball\u201d. Then, we have the following:\n1. Dm \u2192 Marina is a dancer.\n2. (\u2200x)(Wx \u2283 ~Dx). \u2192 For all x, if x is a weakling, then x is not a dancer. In other words, no weakling is a dancer.\n3. (\u2200x)Wx \u2228 Ag / (\u2203x)Ax \u2192 Either everything is a weakling or Georgia plays volleyball. So there exists an x that plays volleyball.\nOptions (A) and (C) do claim that some weaklings are not dancers, but the second premise clearly states that no weakling is a dancer. Thus, we can eliminate them. Option (B) omits the important detail about Georgia playing volleyball. Option (D) has all the details presented in the arguments and is the best English interpretation of the arguments. The answer is (D).\n\nQ: Select the best translation into predicate logic: No people drive on Mars.\n(A) ~Pd (B) (\u2200x)(Px \u2228 ~Dx) (C) (\u2200x)(Px \u2283 ~Dx) (D) ~Dp\nA: Let's think step by step. We refer to Wikipedia articles on formal logic for help. Let\u2019s solve this step by step. Let \u201cP\u201d denote \u201cbeing a person\u201d and \u201cD\u201d denote \u201cdriving on Mars\u201d. Then let\u2019s consider each option:\nOption (A): ~Pd \u2192 d is not a person.\nOption (B): (\u2200x)(Px \u2228 ~Dx) \u2192 For all x, x is a person or x does not drive on Mars.\nOption (C): (\u2200x)(Px \u2283 ~Dx) \u2192 For all x, if x is a person then x does not drive on Mars.\nOption (D): ~Dp \u2192 p does not drive on Mars.\nOf all these options, Option (C) is the best and most meaningful interpretation of the sentence \u201cNo people drive on Mars.\u201d The answer is (C).", "global_facts": "The following are multiple choice questions (with answers) about global facts.\n\nQ: As of 2017, how many of the world\u2019s 1-year-old children today have been vaccinated against some disease?\n(A) 80% (B) 60% (C) 40% (D) 20%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. According to data published by the World Health Organization, the number of 1-year-old children vaccinated in 2017 exceeds 80%. The answer is (A).\n\nQ: As of 2019, about what percentage of Americans agree that the state is run for the benefit of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. In 2019, about 46% of Americans agreed that the state is run for the benefit of all the people. The answer is (B).\n\nQ: As of 2019, about what percentage of Russians say it is very important to have free media in our country without government/state censorship?\n(A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2019, about 38% of Russians say it is very important to have free media in our country. The answer is (A).\n\nQ: As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the Americas.\n(A) increased, increased (B) increased, decreased (C) decreased, increased (D) decreased, decreased\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2015, since 1990 forests have increased in Europe and have decreased in Africa and the Americas.
The answer is (B).\n\nQ: Which of the following pairs of statements are both true (as of 2019)?\n(A) People tend to be optimistic about their own future and the future of their nation or the world. (B) People tend to be optimistic about their own future but pessimistic about the future of their nation or the world. (C) People tend to be pessimistic about their own future but optimistic about the future of their nation or the world. (D) People tend to be pessimistic about their own future and the future of their nation or the world.\nA: Let's think step by step. We refer to Wikipedia articles on global facts for help. As of 2019, most people tend to be optimistic about their own future but pessimistic about the future of their nation or the world. The answer is (B).", "high_school_biology": "The following are multiple choice questions (with answers) about high school biology.\n\nQ: In animal cells, which of the following represents the most likely pathway that a secretory protein takes as it is synthesized in a cell?\n(A) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013rough ER (B) Ribosome\u2013Golgi apparatus\u2013rough ER\u2013secretory vesicle\u2013plasma membrane (C) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013rough ER (D) Ribosome\u2013rough ER\u2013Golgi apparatus\u2013secretory vesicle\u2013plasma membrane\nA: Let's think step by step. Protein synthesis starts at the ribosome, so we can eliminate (A) and (C). The protein then moves into the rough ER and from there to the Golgi apparatus, where it is modified and packaged into a secretory vesicle. The vesicle then travels to the plasma membrane, where the protein is secreted. The answer is (D).\n\nQ: A mutation in a bacterial enzyme changed a previously polar amino acid into a nonpolar amino acid. This amino acid was located at a site distant from the enzyme\u2019s active site. How might this mutation alter the enzyme\u2019s substrate specificity?\n(A) By changing the enzyme\u2019s pH optimum (B) By changing the enzyme\u2019s location in the cell (C) By changing the shape of the protein (D) An amino acid change away from the active site cannot alter the enzyme\u2019s substrate specificity.\nA: Let's think step by step. A change in an amino acid leads to a change in the primary structure of the protein. A change in the primary structure may lead to a change in the secondary and the tertiary structure of the protein. A change in the tertiary structure means a change in the shape of the protein, so (C) has to be correct. Since the change does not affect the active site of the enzyme, we do not expect the activity of the enzyme to be affected. The answer is (C).\n\nQ: Which of the following is not a way to form recombinant DNA?\n(A) Translation (B) Conjugation (C) Specialized transduction (D) Transformation\nA: Let's think step by step. The introduction of foreign DNA or RNA into bacteria or eukaryotic cells is a common technique in molecular biology and scientific research. There are multiple ways foreign DNA can be introduced into cells including transformation, transduction, conjugation, and transfection. In contrast, (A) is not a way to form recombinant DNA: during translation the ribosomes synthesize proteins from RNA. The answer is (A).\n\nQ: Homologous structures are often cited as evidence for the process of natural selection.
All of the following are examples of homologous structures EXCEPT\n(A) the wings of a bird and the wings of a bat (B) the flippers of a whale and the arms of a man (C) the pectoral fins of a porpoise and the flippers of a seal (D) the forelegs of an insect and the forelimbs of a dog\nA: Let's think step by step. Homologous structures are similar physical features in organisms that share a common ancestor but may serve different functions. Comparisons (B) and (C) are clearly homologous because they share a common ancestor and the structures serve different purposes. Bat wings and bird wings are also homologous: although both are wings, they derive from the same ancestral vertebrate forelimb. Insects and dogs are only very distantly related, since one is an invertebrate while the other is a vertebrate, and although their forelegs and forelimbs serve the same purpose, they arose independently, so they are not homologous. The answer is (D).\n\nQ: Which of the following is not known to be involved in the control of cell division?\n(A) Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast cells\nA: Let's think step by step. Normal cells move through the cell cycle in a regulated way. At the checkpoint stage, they use information about their own internal state and cues from the environment around them to decide whether to proceed with cell division. Cues like these act by changing the activity of core cell cycle regulators inside the cell. The most common regulators are cyclins and cyclin-dependent kinases. Fibroblast cells do not play any role in cell division. The answer is (D).", "high_school_chemistry": "The following are multiple choice questions (with answers) about high school chemistry.\n\nQ: Which of the following is considered an acid anhydride?\n(A) HCl (B) H2SO3 (C) SO2 (D) Al(NO3)3\nA: Let's think step by step. An acid anhydride is a compound that is derived by removing water from an acid. The chemical formula for water is H2O, which means that we need to determine which of these options, when combined with H2O, forms an acid. SO2, or sulfur dioxide, when combined with H2O, makes H2SO3, or sulfurous acid. The answer is (C).\n\nQ: Which of the following is expected to be a polar molecule?\n(A) PCl4F (B) BF3 (C) CO2 (D) Si(CH3)4\nA: Let's think step by step. A polar molecule is one that has a slightly positive charge on one end of the molecule and a slightly negative charge on the other end. Boron trifluoride (BF3) has Boron as the center atom and three fluorine atoms attached to it; it is trigonal planar and symmetric, so it is nonpolar. Carbon Dioxide (CO2) has Carbon as the central atom with double bonds to two Oxygen atoms - this is also symmetrical and therefore nonpolar. The same is the case for tetramethyl silane (Si(CH3)4), which is a Silicon atom surrounded by four methyl groups. The structure of PCl4F is that Phosphorus is the central atom, attached to four chlorines and one fluorine atom. This is asymmetrical, and therefore has a net dipole and is expected to be a polar molecule. The answer is (A).\n\nQ: From the solubility rules, which of the following is true?\n(A) All chlorides, bromides, and iodides are soluble (B) All sulfates are soluble (C) All hydroxides are soluble (D) All ammonium-containing compounds are soluble\nA: Let's think step by step. The chlorides, bromides, and iodides of lead, silver, and mercury are not soluble in water. This rules out (A). The sulfates of lead, barium, and calcium are not soluble in water, which rules out (B).
The hydroxides of any cation besides sodium, potassium, ammonium, calcium, and barium are insoluble. This rules out (C). Typically ammonium ions indicate a soluble ionic substance. The answer is (D).\n\nQ: A new compound is synthesized and found to be a monoprotic acid with a molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500 L of water, the pH is measured as 3.89. What is the pKa of this acid?\n(A) 3.89 (B) 7.78 (C) 5.78 (D) 2.33\nA: Let's think step by step. Recall that $[A^{-}] = [H^{+}]$. Here, this is equal to $10^{-3.89}$. Then we have $K_{a} = \\frac{[H^{+}][A^{-}]}{[HA]} = \\frac{10^{-3.89} \\cdot 10^{-3.89}}{10^{-2}}$. The resulting exponent is $-3.89 + (-3.89) - (-2) = -5.78$, therefore $K_a = 10^{-5.78}$. The $pK_a$ is the negative log of $K_a$, which is equal to $5.78$. The answer is (C).\n\nQ: A solution contains 2.00 mole of acetic acid, CH3COOH, and 1.00 mole of calcium acetate, Ca(CH3COO)2. The solution is able to resist the addition of a small amount of strong acid or strong base with only minor changes in the pH of the solution. Larger quantities of strong acid or strong base can cause a significant change in pH. How many moles of nitric acid, HNO3, may be added before the pH begins to change significantly?\n(A) 0.500 mole (B) 1.00 mole (C) 2.00 mole (D) 3.00 mole\nA: Let's think step by step. We would like to compute the buffer capacity of this solution. First we write the equation for the ionization of the weak acid, in this case of acetic acid. $CH_{3}COOH (aq) + H_{2}O \\rightarrow H_{3}O^{+} + CH_{3}COO^{-}$. The conjugate base is therefore the acetate ion. The added strong acid, nitric acid, will react with the conjugate base. Therefore the maximum amount of acid that can be added will be equal to the amount of acetate ion, or 2 moles. The answer is (C).", "high_school_computer_science": "The following are multiple choice questions (with answers) about high school computer science.\n\nQ: Which of the following is an example of the use of a device on the Internet of Things (IoT)?\n(A) A car alerts a driver that it is about to hit an object. (B) A hiker uses a GPS watch to keep track of her position. (C) A refrigerator orders milk from an online delivery service when the milk in the refrigerator is almost gone. (D) A runner uses a watch with optical sensors to monitor his heart rate.\nA: Let's think step by step. The term Internet of Things (IoT) refers to common devices which are connected to the internet, enabling new functionality. Choice A is incorrect because it does not describe an internet connected device. In choice B, the watch is only described as having GPS functionality but no internet connectivity. Choice C describes a common device (a refrigerator) which has internet connectivity enabling new functionality (online ordering). Choice D does not mention internet connectivity for the watch, only optical sensors. The answer is (C).\n\nQ: Many Web browsers allow users to open anonymous windows. During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?\n(A) The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator.
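The pKa arithmetic above can be checked numerically; a minimal sketch (the concentrations follow from the 0.0050 mol in 0.500 L stated in the question):

import math

h = 10 ** -3.89          # [H+] = [A-] from the measured pH
ha = 0.0050 / 0.500      # initial acid concentration, 10^-2 M
ka = h * h / ha          # Ka = [H+][A-]/[HA]
print(-math.log10(ka))   # 5.78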
(B) Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer. (C) A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session. (D) A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded.\nA: Let's think step by step. Choice A is incorrect as it only describes network traffic, which an anonymous browser does not change. Choice B is correct as it correctly describes how an anonymous browser will prevent saving data on the user\u2019s computer after the session is ended. Choice C is incorrect because an anonymous browser will not prevent logging in to email or social media accounts. Choice D is incorrect because an anonymous browser in itself performs no virus protection. The answer is (B).\n\nQ: In the program below, the initial value of X is 5 and the initial value of Y is 10.\nIF (X < 0){\n DISPLAY (\"Foxtrot\")\n} ELSE {\n IF (X > Y){\n DISPLAY (\"Hotel\")\n } ELSE {\n IF (Y > 0){\n DISPLAY (\"November\")\n } ELSE {\n DISPLAY (\"Yankee\")\n }\n }\n}\nWhat is displayed as a result of running the program?\n(A) Foxtrot (B) Hotel (C) November (D) Yankee\nA: Let's think step by step. Because X has the value 5, the first conditional IF (X < 0) is false, so we move to the first ELSE clause. Because X is 5 and Y is 10, the second conditional IF (X > Y) is false, so we move to the following ELSE clause. Since Y is 10, the conditional IF (Y > 0) is true, so the command DISPLAY (\"November\") is executed. The answer is (C).\n\nQ: What is the output of \"abc\"[::-1] in Python 3?\n(A) Error (B) abc (C) cba (D) c\nA: Let's think step by step. We know that the slicing operator [::-1] takes all of the elements in the string in reverse order, so we reverse the order of the string \"abc\", resulting in \"cba\". The answer is (C).\n\nQ: A list of numbers has n elements, indexed from 1 to n. The following algorithm is intended to display the number of elements in the list that have a value greater than 100. The algorithm uses the variables count and position. Steps 3 and 4 are missing.\n Step 1: Set count to 0 and position to 1.\n Step 2: If the value of the element at index position is greater than 100, increase the value of count by 1.\n Step 3: (missing step)\n Step 4: (missing step)\n Step 5: Display the value of count.\nWhich of the following could be used to replace steps 3 and 4 so that the algorithm works as intended?\n(A) Step 3: Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until the value of count is greater than 100.\n(B) Step 3: Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until the value of position is greater than n.\n(C) Step 3: Repeat step 2 until the value of count is greater than 100.\n Step 4: Increase the value of position by 1.\n(D) Step 3: Repeat step 2 until the value of position is greater than n.\n Step 4: Increase the value of count by 1.\nA: Let's think step by step. Choice A is incorrect, because its Step 4 has an incorrect termination condition, stopping when count is greater than 100. We need to stop after inspecting all elements in the list. Choice B is correct because it correctly increments both count and position, and correctly repeats these steps and terminates when all elements in the list have been inspected. 
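As a quick sanity check on two of the worked answers above, here is a minimal Python sketch (illustrative only, not part of the prompt data): it recomputes the pKa from the chemistry prompt and replays the conditional trace and string reversal from the computer-science prompt.

import math

# pKa check: 0.0050 mol of a monoprotic acid in 0.500 L, measured pH 3.89.
# [H+] = [A-], and dissociation is small enough that [HA] is taken as 1e-2 M.
pH = 3.89
Ka = (10 ** -pH) ** 2 / (0.0050 / 0.500)
print(round(-math.log10(Ka), 2))  # 5.78

# Conditional trace with X = 5 and Y = 10: control falls through to Y > 0.
X, Y = 5, 10
if X < 0:
    print("Foxtrot")
elif X > Y:
    print("Hotel")
elif Y > 0:
    print("November")  # this branch runs
else:
    print("Yankee")

# Slicing with a negative step reverses the string.
print("abc"[::-1])  # cba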
Choice C is incorrect because it incorrectly increments the variable count until its value is greater than 100, regardless of the elements in the list. Choice D is incorrect because its step 3 does not increment the value of position, so it will repeat forever. The answer is (B).", "high_school_european_history": "The following are multiple choice questions (with answers) about high school european history.\n\nQ: This question refers to the following information.\nAlbeit the king's Majesty justly and rightfully is and ought to be the supreme head of the Church of England, and so is recognized by the clergy of this realm in their convocations, yet nevertheless, for corroboration and confirmation thereof, and for increase of virtue in Christ's religion within this realm of England, and to repress and extirpate all errors, heresies, and other enormities and abuses heretofore used in the same, be it enacted, by authority of this present Parliament, that the king, our sovereign lord, his heirs and successors, kings of this realm, shall be taken, accepted, and reputed the only supreme head in earth of the Church of England, called Anglicans Ecclesia; and shall have and enjoy, annexed and united to the imperial crown of this realm, as well the title and style thereof, as all honors, dignities, preeminences, jurisdictions, privileges, authorities, immunities, profits, and commodities to the said dignity of the supreme head of the same Church belonging and appertaining; and that our said sovereign lord, his heirs and successors, kings of this realm, shall have full power and authority from time to time to visit, repress, redress, record, order, correct, restrain, and amend all such errors, heresies, abuses, offenses, contempts, and enormities, whatsoever they be, which by any manner of spiritual authority or jurisdiction ought or may lawfully be reformed, repressed, ordered, redressed, corrected, restrained, or amended, most to the pleasure of Almighty God, the increase of virtue in Christ's religion, and for the conservation of the peace, unity, and tranquility of this realm; any usage, foreign land, foreign authority, prescription, or any other thing or things to the contrary hereof notwithstanding.\nEnglish Parliament, Act of Supremacy, 1534\nFrom the passage, one may infer that the English Parliament wished to argue that the Act of Supremacy would\n(A) give the English king a new position of authority (B) give the position of head of the Church of England to Henry VIII alone and exclude his heirs (C) establish Calvinism as the one true theology in England (D) end various forms of corruption plaguing the Church in England\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The Act of Supremacy states that it grants authority to the king \"to repress and extirpate all errors, heresies, and other enormities and abuses\", referring to the corruption in the Church of England. The answer is (D).\n\nQ: This question refers to the following information.\nRead the following excerpt.\nThe revolutionary seed had penetrated into every country and spread more or less. It was greatly developed under the r\u00e9gime of the military despotism of Bonaparte. 
His conquests displaced a number of laws, institutions, and customs; broke through bonds sacred among all nations, strong enough to resist time itself; which is more than can be said of certain benefits conferred by these innovators.\nThe monarchs will fulfil the duties imposed upon them by Him who, by entrusting them with power, has charged them to watch over the maintenance of justice, and the rights of all, to avoid the paths of error, and tread firmly in the way of truth. Placed beyond the passions which agitate society, it is in days of trial chiefly that they are called upon to despoil realities of their false appearances, and to show themselves as they are, fathers invested with the authority belonging by right to the heads of families, to prove that, in days of mourning, they know how to be just, wise, and therefore strong, and that they will not abandon the people whom they ought to govern to be the sport of factions, to error and its consequences, which must involve the loss of society.\nUnion between the monarchs is the basis of the policy which must now be followed to save society from total ruin. . . .\nLet them not confound concessions made to parties with the good they ought to do for their people, in modifying, according to their recognized needs, such branches of the administration as require it.\nLet them be just, but strong; beneficent, but strict.\nLet them maintain religious principles in all their purity, and not allow the faith to be attacked and morality interpreted according to the social contract or the visions of foolish sectarians.\nLet them suppress Secret Societies; that gangrene of society.\n\u2014Klemens von Metternich, Political Confession of Faith, 1820\nWhich of the following was the greatest cause of the fears expressed by Metternich in the document above?\n(A) The ideas of personal liberty and nationalism conceived during the Enlightenment resulted in radical revolutions that could spread throughout Europe. (B) The conquest of Europe by Napoleon led to the creation of new factions and shifted the European balance of power. (C) The power of monarchs had grown to the point where it needed to be checked by other powers within each nation or domination of civilians would occur. (D) The rising and falling economic cycle of the newly emerging capitalist economy could lead to civilian unrest that must be suppressed.\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The fears of revolution in early 19th century Europe expressed by Klemens von Metternich, a conservative Austrian statesman, were a direct result of the age of Enlightenment, a period of European history where the absolute power of the monarchy was challenged with ideas of individual liberty and nationalism, leading to the French revolution and its effects all over Europe. 
The answer is (A).\n\nQ: This question refers to the following information.\nThe excerpts below are from the Navigation Acts of 1651.\n[A]fter the first day of December, one thousand six hundred fifty and one, and from thence forwards, no goods or commodities whatsoever of the growth, production or manufacture of Asia, Africa or America, or of any part thereof; or of any islands belonging to them, or which are described or laid down in the usual maps or cards of those places, as well of the English plantations as others, shall be imported or brought into this Commonwealth of England, or into Ireland, or any other lands, islands, plantations, or territories to this Commonwealth belonging, or in their possession, in any other ship or ships, vessel or vessels whatsoever, but only in such as do truly and without fraud belong only to the people of this Commonwealth, or the plantations thereof, as the proprietors or right owners thereof; and whereof the master and mariners are also of the people of this Commonwealth, under the penalty of the forfeiture and loss of all the goods that shall be imported contrary to this act. . . .\n[N]o goods or commodities of the growth, production, or manufacture of Europe, or of any part thereof, shall after the first day of December, one thousand six hundred fifty and one, be imported or brought into this Commonwealth of England, or any other lands or territories to this Commonwealth belonging, or in their possession, in any ship or ships, vessel or vessels whatsoever, but in such as do truly and without fraud belong only to the people of this Commonwealth, and in no other, except only such foreign ships and vessels as do truly and properly belong to the people of that country or place, of which the said goods are the growth, production or manufacture.\nWhich of the following best describes the outcome of the Navigation Acts of 1651?\n(A) They served as a catalyst for the growth of English shipping and overseas trade, but did little to limit the prospects of the Dutch in the seventeenth century. (B) They brought about almost immediate hardships for the Dutch economy as their dominance of overseas trade quickly ended. (C) They were rescinded during the restoration of the Stuarts as they sought normal diplomatic relations with the Dutch so as not to need Parliament's financial support for war. (D) They led to nearly a century of recurrent war between England and the Netherlands, which would not end until after American independence.\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The Navigation Acts of 1651 helped English shipping by restricting the ability of ships from other European countries, especially the Dutch, to transport goods from colonies in Asia and Africa into England. The answer is (A).\n\nQ: This question refers to the following information.\nIn Russia there was nothing going on well, and [Souvarine] was in despair over the news he had received. His old companions were all turning to the politicians; the famous Nihilists who made Europe tremble-sons of village priests, of the lower middle class, of tradesmen-could not rise above the idea of national liberation, and seemed to believe that the world would be delivered-when they had killed their despot\u2026\n\"Foolery! They'll never get out of it with their foolery.\"\nThen, lowering his voice still more, in a few bitter words he described his old dream of fraternity.
He had renounced his rank and his fortune; he had gone among workmen, only in the hope of seeing at last the foundation of a new society of labour in common. All the sous in his pockets had long gone to the urchins of the settlement; he had been as tender as a brother with the colliers, smiling at their suspicion, winning them over by his quiet workmanlike ways and his dislike of chattering. But decidedly the fusion had not taken place.\nHis voice changed, his eyes grew bright, he fixed them on \u00c9tienne, directly addressing him:\n\"Now, do you understand that? These hatworkers at Marseilles who have won the great lottery prize of a hundred thousand francs have gone off at once and invested it, declaring that they are going to live without doing anything! Yes, that is your idea, all of you French workmen; you want to unearth a treasure in order to devour it alone afterwards in some lazy, selfish corner. You may cry out as much as you like against the rich, you haven't got courage enough to give back to the poor the money that luck brings you. You will never be worthy of happiness as long as you own anything, and your hatred of the bourgeois proceeds solely from an angry desire to be bourgeois yourselves in their place.\"\n\u00c9mile Zola, French writer, Germinal, 1885\nThe passage displays the direct concern for the welfare of the working classes that was typically a part of which movement?\n(A) Capitalist (B) Scientific (C) Communist (D) Existentialist\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. The modern Communist movement aims to establish a classless society based on communal ownership and distribution of property and means of production, thereby especially benefiting the working classes. The answer is (C).\n\nQ: This question refers to the following information.\nThe following excerpt is from a pamphlet.\nYou will do me the justice to remember, that I have always strenuously supported the Right of every man to his own opinion, however different that opinion might be to mine. He who denies to another this right, makes a slave of himself to his present opinion, because he precludes himself the right of changing it.\nThe most formidable weapon against errors of every kind is Reason. I have never used any other, and I trust I never shall.\nThe circumstance that has now taken place in France of the total abolition of the whole national order of priesthood, and of everything appertaining to compulsive systems of religion, and compulsive articles of faith, has not only precipitated my intention, but rendered a work of this kind exceedingly necessary, lest in the general wreck of superstition, of false systems of government, and false theology, we lose sight of morality, of humanity, and of the theology that is true.\nI believe in one God, and no more; and I hope for happiness beyond this life.\nI believe in the equality of man; and I believe that religious duties consist in doing justice, loving mercy, and endeavoring to make our fellow-creatures happy.\nI do not believe in the creed professed by the Jewish church, by the Roman church, by the Greek church, by the Turkish church, by the Protestant church, nor by any church that I know of.
My own mind is my own church.\nAll national institutions of churches, whether Jewish, Christian or Turkish, appear to me no other than human inventions, set up to terrify and enslave mankind, and monopolize power and profit.\nI do not mean by this declaration to condemn those who believe otherwise; they have the same right to their belief as I have to mine.\n\u2014Thomas Paine, The Age of Reason, 1794\u20131795\nWhich of the following Enlightenment philosophes designed a system of checks and balances for government to avoid abuses of power?\n(A) Jean Jacques Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft (D) Adam Smith\nA: Let's think step by step. We refer to Wikipedia articles on european history for help. Baron Montesquieu was an 18th century French philosopher who wrote extensively against the monopolization of power and advocated for a system of checks and balances in government to prevent the rise of despotism. The answer is (B).", "high_school_geography": "The following are multiple choice questions (with answers) about high school geography.\n\nQ: Which one of the following items is an example of nonmaterial culture?\n(A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not physical objects. The answer is (C).\n\nQ: During the third stage of the demographic transition model, which of the following is true?\n(A) Birth rates increase and population growth rate is less rapid. (B) Birth rates decline and population growth rate is less rapid. (C) Birth rates increase and population growth rate increases. (D) Birth rates decrease and population growth rate increases.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. The demographic transition model models the five different stages of population growth as a country goes through economic development, where the third stage refers to a period of declining birth rates and lower population growth. The answer is (B).\n\nQ: The practice of hiring a foreign third-party service provider to run an operation is called\n(A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. \"Offshoring\" literally means to move or base some of the activities or processes of a company to a foreign country. The answer is (B).\n\nQ: Which of the following statements is NOT accurate regarding the services provided by local governments in the United States?\n(A) Duplication of efforts occurs often. (B) Social problems of the central city spill over into the surrounding residential suburbs. (C) Inefficiency in providing services occurs often. (D) One neighborhood's efforts to reduce pollution are always supported by neighboring communities.\nA: Let's think step by step. We refer to Wikipedia articles on geography for help. There may be economic, social or political reasons for two neighboring communities and their local governments not agreeing to pollution reduction efforts initiated by one of them. The answer is (D).\n\nQ: The rate of natural increase of a population is found by subtracting the\n(A) crude death rate from the crude birth rate. (B) crude birth rate from the crude death rate. (C) doubling time from the crude birth rate. (D) fertility rate from the crude death rate.\nA: Let's think step by step.
We refer to Wikipedia articles on geography for help. The difference between the number of births and the number of deaths gives the population increase at any given time. The answer is (A).", "high_school_government_and_politics": "The following are multiple choice questions (with answers) about high school government and politics.\n\nQ: Which of the following best states an argument made by James Madison in The Federalist number 10?\n(A) Honest politicians can prevent factions from developing. (B) Factions are more likely to occur in large republics than in small ones. (C) The negative effects of factionalism can be reduced by a republican government. (D) Free elections are the people's best defense against factionalism.\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. In the Federalist number 10, James Madison advocated for a representative republican form of government to guard against factionalism. The answer is (C).\n\nQ: The term \"budget deficit\" refers to the\n(A) annual increase in federal spending on the military (B) amount of interest on the national debt (C) difference between the initial budget proposals made by the president and Congress (D) amount the government spends in excess of its revenues\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. When the government spends more than it earns, the difference is the budget deficit. The answer is (D).\n\nQ: Which of the following statements about cabinet departments is FALSE?\n(A) They are established by the legislative branch. (B) Their members often don't have much influence over presidential decisions. (C) They cannot all be run by leaders who belong to the same political party the president does. (D) Not every federal agency is a cabinet department.\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. There is no law stipulating that some cabinet department leaders have to belong to a political party different from that of the president. The answer is (C).\n\nQ: Which of the following cases established the precedent that a defendant must be informed of the right to remain silent, the right to a lawyer, and protection from self-incrimination?\n(A) Weeks v. United States (B) Betts v. Brady (C) Mapp v. Ohio (D) Miranda v. Arizona\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. In the landmark Miranda v. Arizona in 1966, the US Supreme Court, based on the Fifth and Sixth Amendment of the US Constitution, guaranteed a defendant's right to an attorney and protection from self-incrimination. The answer is (D).\n\nQ: Uncertainty over the limits to presidential power is caused primarily by the fact that\n(A) the constitutional definition of those powers is broad and unspecific (B) most people agree that the Constitution places too many limits on presidential power (C) the Supreme Court consistently refuses to rule on cases concerning presidential powers (D) constitutional amendments have greatly increased presidential powers\nA: Let's think step by step. We refer to Wikipedia articles on government and politics for help. The US Constitution is not very specific about the powers of the president, leading to uncertainty over their limits.
The answer is (A).", "high_school_macroeconomics": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\nQ: Which of the following policies best describes supply-side fiscal policy?\n(A) An increase in the money supply (B) Increased government spending (C) Lower taxes on research and development of new technology (D) Higher taxes on household income\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. Supply-side fiscal policy stimulates the economy by encouraging more production of goods and services through reduction in taxes and deregulation. The answer is (C).\n\nQ: The short-run Phillips curve indicates a\n(A) direct relation between unemployment and inflation (B) direct relation between price and quantity demanded (C) inverse relation between price and quantity demanded (D) inverse relation between unemployment and inflation\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. The short-run Phillips curve shows that whenever unemployment decreases below a natural level, inflation starts increasing, and vice-versa. The answer is (D).\n\nQ: Holding all else equal which of the following monetary policies would be used to boost U.S. exports?\n(A) Increasing the discount rate (B) Increasing the reserve ratio (C) Buying government securities (D) Lowering tariffs\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. Buying government securities expands the money supply and lowers interest rates, which reduces foreign demand for US dollars; a cheaper dollar makes US exports more attractive. The answer is (C).\n\nQ: A federal deficit occurs when\n(A) exports exceed imports. (B) imports exceed exports. (C) federal tax collections exceed spending. (D) federal spending exceeds federal tax revenues.\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. A federal deficit occurs when federal spending exceeds federal income, which comes primarily from tax revenues. The answer is (D).\n\nQ: Which of the following is not included in the U.S. GDP?\n(A) The U.S. military opens a new base in a foreign country with 1000 U.S. personnel. (B) Japanese consumers buy thousands of CDs produced in the United States. (C) An American pop singer performs a sold-out concert in Paris. (D) A French theatrical production tours dozens of American cities.\nA: Let's think step by step. We refer to Wikipedia articles on macroeconomics for help. The economic transactions related to the performance of the American pop singer in Paris happen entirely outside the U.S. and hence are not included in the GDP numbers. The answer is (C).", "high_school_mathematics": "The following are multiple choice questions (with answers) about high school mathematics.\n\nQ: Simplify and write the result with a rational denominator: $$\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}$$\n(A) \\frac{3\\sqrt{3}}{3} (B) \\frac{1}{3} (C) \\sqrt{3} (D) \\frac{\\sqrt{3}}{3}\nA: Let's think step by step. Factoring $729=3^6$ and combining the roots $\\frac{1}{2}\\cdot\\frac{1}{3}\\cdot\\frac{1}{2}=\\frac{1}{12}$, we get that $\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}=\\left(\\frac{1}{3^6}\\right)^{\\frac{1}{12}}=\\frac{1}{3^{\\frac{1}{2}}}=\\frac{\\sqrt{3}}{3}$. The answer is (D).\n\nQ: Five thousand dollars compounded annually at an $x\\%$ interest rate takes six years to double.
At the same interest rate, how many years will it take $\\$300$ to grow to $\\$9600$?\n(A) 12 (B) 1 (C) 30 (D) 5\nA: Let's think step by step. To go from $\\$300$ to $\\$9600$, the value must go up by a factor of $9600/300=32=2^5$. Since at this interest rate it takes six years for it to double, it will take $5*6=30$ years to grow to $\\$9600$. The answer is (C).\n\nQ: Ten students take a biology test and receive the following scores: 45, 55, 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students\u2019 test scores?\n(A) 55 (B) 60 (C) 62 (D) 65\nA: Let's think step by step. There are 10 students and the sum of their scores is $45 + 55 + 50 + 70 + 65 + 80 + 40 + 90 + 70 + 85 = 650$, so the mean is $650/10=65$. The answer is (D).\n\nQ: The variable $x$ varies directly as the square of $y$, and $y$ varies directly as the cube of $z$. If $x$ equals $-16$ when $z$ equals 2, what is the value of $x$ when $z$ equals $\\frac{1}{2}$?\n(A) -1 (B) 16 (C) -\\frac{1}{256} (D) \\frac{1}{16}\nA: Let's think step by step. We know that $x \\propto y^2$ and $y \\propto z^3$, so $x = k z^6$ for some constant $k$. Plugging in $x=-16$ and $z=2$, the constant value is $k=\\frac{x}{z^6}=\\frac{-16}{64}=-\\frac{1}{4}$. So, when $z=\\frac{1}{2}$, the value of $x$ is $x=kz^6=-\\frac{1}{4}\\cdot\\frac{1}{2^6}=-\\frac{1}{256}$. The answer is (C).\n\nQ: Joe was in charge of lights for a dance. The red light blinks every two seconds, the yellow light every three seconds, and the blue light every five seconds. If we include the very beginning and very end of the dance, how many times during a seven minute dance will all the lights come on at the same time? (Assume that all three lights blink simultaneously at the very beginning of the dance.)\n(A) 3 (B) 15 (C) 6 (D) 5\nA: Let's think step by step. The least common multiple of 2, 3 and 5 is 30, so during a 7 minute dance, all three lights will come on at the same time $2*7+1=15$ times. The answer is (B).", "high_school_microeconomics": "The following are multiple choice questions (with answers) about high school microeconomics.\n\nQ: Which of the following is necessarily a characteristic of oligopoly?\n(A) Free entry into and exit from the market (B) A few large producers (C) One producer of a good with no close substitutes (D) A homogenous product\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. An oligopoly is a market dominated by just a few large sellers or producers. Oligopolistic markets have high barriers to new entry, and their products may be differentiated. The answer is (B).\n\nQ: If the government subsidizes producers in a perfectly competitive market, then\n(A) the demand for the product will increase (B) the demand for the product will decrease (C) the consumer surplus will increase (D) the consumer surplus will decrease\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. (A) and (B) are wrong because the demand curve does not change at all. If the government subsidizes producers, the supply will increase, and thus the consumer surplus also increases. The answer is (C).\n\nQ: Which of the following is true of a price floor?\n(A) The price floor shifts the demand curve to the left. (B) An effective floor creates a shortage of the good. (C) The price floor shifts the supply curve of the good to the right. (D) To be an effective floor, it must be set above the equilibrium price.\nA: Let's think step by step.
We refer to Wikipedia articles on microeconomics for help. A price floor does not shift the demand or supply curve. An effective price floor must be set above the equilibrium price; otherwise the market clears at the equilibrium price and the floor has no effect. The answer is (D).\n\nQ: The concentration ratio for a monopoly is\n(A) 0 (B) 5 (C) 10 (D) 100\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. The concentration ratio is calculated as the sum of market share of a specific number of largest companies. Monopoly means one company or entity controls the entire market, therefore, the concentration ratio is 100 percent. The answer is (D).\n\nQ: In a competitive labor market for housepainters, which of the following would increase the demand for housepainters?\n(A) An effective minimum wage imposed on this labor market. (B) An increase in the price of gallons of paint. (C) An increase in the construction of new houses. (D) An increase in the price of mechanical painters so long as the output effect exceeds the substitution effect.\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics for help. An increase in the construction of new houses means increased demand for house painting, which in turn increases the demand for housepainters. The answer is (C).", "high_school_physics": "The following are multiple choice questions (with answers) about high school physics.\n\nQ: A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven?\n(A) 10 W (B) 30 W (C) 60 W (D) 240 W\nA: Let's think step by step. Rate of energy usage is known as power; in a dissipative electrical circuit, power is given by voltage times current. So in our case, the power is 120 V times 2 amps, or 240 W. The answer is (D).\n\nQ: A point charge, Q = +1 mC, is fixed at the origin. How much work is required to move a charge, Q = +8 \u00b5C, from the point (0, 4 meters) to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\nA: Let's think step by step. To calculate the work required to move a charge from one location to another in a fixed electric field, it is enough to calculate the potential difference between the two locations. Here, the potential only depends on the distance between the charges; it\u2019s $k q_1 q_2 / r$, where $k$ is Coulomb\u2019s constant. Plugging in the values $q_1 = 1$ mC and $q_2 = 8$ \u00b5C, with distances of 4 meters and 3 meters, gives the answer as 5.992 J, which rounds to 6 J. The answer is (B).\n\nQ: Which of the following conditions will ensure that angular momentum is conserved? I. Conservation of linear momentum II. Zero net external force III. Zero net external torque\n(A) I and II only (B) I and III only (C) II and III only (D) III only\nA: Let's think step by step. Torque is defined as the change in angular momentum; if there is zero external torque, angular momentum is conserved. The answer is (D).\n\nQ: A photocell of work function \u03d5 = 2eV is connected to a resistor in series. Light of frequency f = 1 \u00d7 10^15 Hz hits a metal plate of the photocell. If the power of the light is P = 100 W, what is the current through the resistor?\n(A) 2:00 AM (B) 6:00 AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The only answer above which has units of current is D, 24 A. The answer is (D).\n\nQ: A pipe full of air is closed at one end. A standing wave is produced in the pipe, causing the pipe to sound a note.
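The arithmetic in the mathematics and physics rationales above is easy to verify mechanically; here is a minimal Python sketch (illustrative only, not part of the prompt data):

from math import lcm

# Doubling time: $300 -> $9600 is a factor of 32 = 2**5, i.e. five doublings
# of six years each.
assert 9600 / 300 == 2 ** 5
print(5 * 6)  # 30 years

# Direct variation: x = k * z**6, with x = -16 at z = 2.
k = -16 / 2 ** 6  # -1/4
print(k * 0.5 ** 6)  # -0.00390625, i.e. -1/256

# Blinking lights: all three coincide every lcm(2, 3, 5) = 30 seconds; a
# seven-minute dance with both endpoints included gives 420 // 30 + 1 times.
print(7 * 60 // lcm(2, 3, 5) + 1)  # 15

# Work moving a charge between two distances from a fixed point charge:
# W = k_e * q1 * q2 * (1/r_final - 1/r_initial).
k_e = 8.988e9  # Coulomb's constant, N m^2 / C^2
q1, q2 = 1e-3, 8e-6  # +1 mC and +8 microcoulombs
print(k_e * q1 * q2 * (1 / 3 - 1 / 4))  # ~5.99 J, which rounds to 6 J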
Which of the following is a correct statement about the wave\u2019s properties at the closed end of the pipe?\n(A) The pressure is at a node, but the particle displacement is at an antinode. (B) The pressure is at an antinode, but the particle displacement is at a node. (C) The pressure and the particle displacement are both at nodes. (D) The pressure and the particle displacement are both at antinodes.\nA: Let's think step by step. At the closed end of the pipe, the particles cannot have any net displacement because the pipe closure stops them. So the particle displacement is at a node. This closure also causes the pressure to be maximal, i.e. an antinode. The answer is (B).", "high_school_psychology": "The following are multiple choice questions (with answers) about high school psychology.\n\nQ: Pascale is interested in the processing strategies children use to learn new information. Pascale would best be classified as what type of psychologist?\n(A) sociocultural (B) clinical (C) cognitive (D) behaviorist\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. A sociocultural psychologist focuses on the effect of societal factors on people. A clinical psychologist focuses on people with mental issues. A cognitive psychologist focuses on how people think and learn, including their processing strategies. A behaviorist focuses more on the effects of environment and experience on people. The answer is (C).\n\nQ: According to Caplan's model of consultee-centered case consultation, the consultant is primarily interested in\n(A) identifying the causes and solutions of the client's presenting problems (B) identifying and eliminating the causes of the consultee's difficulties in handling a problem (C) establishing a hierarchy of authority to enable effective decision making (D) presenting a single, well-defined and unambiguous course of action for the consultant to overcome skills deficits\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Caplan defines two types of consultation. Client-centered case consultation aims to handle the client's problems, while consultee-centered case consultation aims to identify the reasons for the consultee's difficulty in handling a problem. The answer is (B).\n\nQ: According to the Individuals with Disabilities Education Improvement Act, which of the following must an educational agency do before it changes the educational placement of a student with a disability?\n(A) Give the child a trial period in the new environment (B) Notify the parents in writing (C) Obtain school board approval (D) Obtain parental consent\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. When an educational agency decides to change the educational placement of a student with a disability, it must provide the parents with prior written notice. The answer is (B).\n\nQ: While swimming in the ocean, Ivan is frightened by a dark shadow in the water even before he has the chance to identify what the shadow is. The synaptic connections taking place during this incident of fright are best described by which of the following?\n(A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are sent from the thalamus to the \"what\" and \"where\" pathways. (C) Messages are sent from the parasympathetic nervous system to the cerebral cortex. (D) Messages are sent from the frontal lobes to the pituitary gland.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help.
Our nervous system has a mechanism that responds to emotional signals immediately, before they reach the brain's thinking centers. In Ivan's case, messages travel directly from the thalamus to the amygdala. The answer is (A).\n\nQ: Ani believes that her attitudes and behavior play a central role in what happens to her. Such a belief is likely to be associated with\n(A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an internal locus of control.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. People with an external locus of control believe fate and luck play an important role in their lives, while people with an internal locus of control believe they control their lives. The answer is (D).", "high_school_statistics": "The following are multiple choice questions (with answers) about high school statistics.\n\nQ: A new smartwatch is manufactured in one part of a factory, then secured for shipping in another, independent part of the factory. The weight of the smartwatch has a mean of 62 grams and a standard deviation of 1.0 grams. The weight of the packaging (box, user's guide, bubble wrap, etc.) has a mean of 456 grams and a standard deviation of 6 grams. Together, the distribution of the weight of the smartwatch and its packaging would have the following mean and standard deviation:\n(A) Mean 518 grams; standard deviation 7.0 grams (B) Mean 518 grams; standard deviation 3.5 grams (C) Mean 518 grams; standard deviation 6.1 grams (D) Mean 394 grams; standard deviation 6.1 grams\nA: Let's think step by step. Since the weight of the watch and the weight of the packaging are independent random variables, the mean and variance of their sum are equal to the sum of their individual means and variances. So the mean is 62 + 456 = 518 grams, and the variance is 1.0^2 + 6.0^2 = 37, leading to a standard deviation of about 6.1 grams. The answer is (C).\n\nQ: After a frost warning was issued, the owner of a large orange grove asked his workers to spray all his trees with water. The water was supposed to freeze and form a protective covering of ice around the orange blossom. Nevertheless, the owner suspected that some trees suffered considerable damage due to the frost. To estimate the proportion of trees that suffered more than 50 percent damage due to the frost, he took a random sample of 100 trees from his grove. What is the response variable in this experiment?\n(A) The proportion of trees that suffered more than 50 percent damage due to frost. (B) The number of trees affected by the frost. (C) The number of trees sampled from the grove. (D) For each sampled tree, whether it suffered more than 50 percent damage or at most 50 percent damage.\nA: Let's think step by step. In this experiment, the response variable is what is measured. For each tree, what is measured is whether or not it suffered more than 50 percent damage due to the frost. The answer is (D).\n\nQ: Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and var(Y) = 12. What are the expected value and variance of the random variable X + Y?\n(A) E(X + Y) = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X + Y) = 99, var(X + Y) = 17 (D) There is insufficient information to answer this question.\nA: Let's think step by step.
While the means of sums of random variables add (regardless of whether the variables are independent), to determine the variance of a sum of random variables we need to know not just their individual variances but also the covariance of the two variables, which is not given in this problem. The answer is (D).\n\nQ: Which of the following sets has the smallest standard deviation? Which has the largest?\nI: {1,2,3}\nII: {-10,10}\nIII: {100}\n(A) I, II (B) II, III (C) III, I (D) III, II\nA: Let's think step by step. The variance of distribution I is the expected squared deviation from its mean (which is 2), so the variance is 2/3. The variance of distribution II is 10^2 (because both elements are 10 away from the mean of zero). The variance of distribution III is 0, since it has a single entry. So distribution III has the smallest standard deviation and distribution II has the largest. The answer is (D).\n\nQ: Which of the following is a correct statement about correlation?\n(A) If the slope of the regression line is exactly 1, then the correlation is exactly 1. (B) If the correlation is 0, then the slope of the regression line is undefined. (C) Switching which variable is called x and which is called y changes the sign of the correlation. (D) The correlation r is equal to the slope of the regression line when z-scores for the y-variable are plotted against z-scores for the x-variable.\nA: Let's think step by step. Statement A is false because the slope of the regression line being exactly 1 can occur even when the two variables are not perfectly correlated. Statement B is false because when the correlation is 0, the slope of the regression line is 0, not undefined. Statement C is false because correlation is symmetric in the two random variables. The answer is (D).", "high_school_us_history": "The following are multiple choice questions (with answers) about high school us history.\n\nQ: This question refers to the following information.\nI come not to urge personal claims, nor to seek individual benefits; I appear as the advocate of those who cannot plead their own cause; I come as the friend of those who are deserted, oppressed, and desolate. In the Providence of God, I am the voice of the maniac whose piercing cries from the dreary dungeons of your jails penetrate not your Halls of Legislation. I am the Hope of the poor crazed beings who pine in the cells, and stalls, and cages, and waste rooms of your poor-houses.
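The statistics answers above can likewise be checked numerically; a minimal Python sketch (illustrative only, not part of the prompt data):

from math import sqrt
from statistics import pstdev

# Sum of independent weights: means add, and variances (not standard
# deviations) add.
print(62 + 456)  # 518 grams
print(round(sqrt(1.0 ** 2 + 6.0 ** 2), 1))  # 6.1 grams

# Population standard deviations of the three sets: III is smallest (0),
# II is largest (10).
print(pstdev([1, 2, 3]), pstdev([-10, 10]), pstdev([100]))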
I am the Revelation of hundreds of wailing, suffering creatures, hidden in your private dwellings, and in pens and cabins\u2014shut out, cut off from all healing influences, from all mind-restoring cares.\u2026 Could their melancholy histories be spread before you as revealed to my grieved spirit during the last three months, how promptly, how earnestly would you search out the most approved means of relief; how trifling, how insignificant, by comparison, would appear the sacrifices you are asked to make; how would a few dimes and dollars, gathered from each citizen, diminish in value as a possession, compared with the certain benefits and vast good to be secured for the suffering insane...by the consecration and application of a sufficient fund to the construction of a suitable hospital.\u2026\n\u2014Dorothea Dix, Memorial Soliciting a State Hospital for the Protection and Cure of the Insane,\nSubmitted to the General Assembly of North Carolina, November 1848\nDorothea Dix can best be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C) Shirley Temple (D) Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. Both Dorothea Dix and Clara Barton were American nurses. The answer is (B).\n\nQ: This question refers to the following information.\n\"As our late Conduct at the Conestoga Manor and Lancaster have occasioned much Speculation & a great diversity of Sentiments in this and neighboring Governments; some vindicating & others condemning it; some charitably alleviating the Crime, & others maliciously painting it in the most odious & detestable Colours, we think it our duty to lay before the Publick, the whole Matter as it appeared, & still appears, to us. . . .\n\"If these things are not sufficient to prove an unjustifiable Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend them & an utter insensibility to human Distresses, let us consider a few more recent Facts. When we found the last Summer that we were likely to get no Assistance from the Government, some Volunteers went out at our own Expense, determined to drive our Enemies from our Borders; & when we came near to the great Island, we understood that a Number of their Warriors had gone out against our Frontiers. Upon this we returned and came up with them and fought with them at the Munfey Hill where we lost some of our Men & killed some of their Warriors & thereby saved our Frontiers from this Story in another Expedition. But no sooner had we destroyed their Provisions on the great Island, & ruined their trade with the good People at Bethlehem, but these very Indians, who were justly suspected of having murdered our Friends in Northampton County, were by the Influence of some Quakers taken under the Protection of the Government to screen them from the Resentments of the Friends and Relations of the Murdered, & to support them thro the Winter.\"\n\u2014\"Apology of the Paxton Boys\" (pamphlet), 1764 (Note: \"apology\" in this context should be read as an explanation, not an admission of guilt or regret.)\nThe sentiments expressed in the explanation above reflect which of the ongoing tensions during the colonial period of American history?\n(A) Tensions between British policies and the aspirations of North American colonists. (B) Tensions between American Indians allied with the French and those allied with the British. (C) Tensions between freed African Americans and white planters.
(D) Tensions between backcountry settlers and elites within colonial America.\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. After the French and Indian War, the Scotch-Irish settlers attacked American Indians. After the attacks on the Conestoga, about 250 Paxton Boys presented their grievances to the Pennsylvania legislature. As mentioned in the information, the Paxton Boys cited resentment of local elites. The answer is (D).\n\nQ: This question refers to the following information.\nOur leaders talk about stopping aggression from the north, but this was a struggle among groups of Vietnamese until we intervened. We seem bent upon saving the Vietnamese from Ho Chi Minh even if we have to kill them and demolish their country to do it. As the native people survey bombed-out villages, women and children burned by napalm, rice crops destroyed and cities overrun with our military personnel, they are doubtless saying secretly of the Vietcong guerillas and of the American forces, \"A plague on both your houses.\" \u2026 Stop the bombing, north and south, end search and destroy offensive sweeps, and confine our military action to holding operations on the ground. Bombing the north has failed to halt or seriously check the flow of troops to the south and may, in fact, have prompted a much greater war effort by Hanoi.\n\u2014Senator George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\nWhich of the following opinions from the 1960s most directly reflects the perspective of George McGovern's speech?\n(A) Americans must maximize their technological edge in Vietnam. (B) American bombing in Vietnam is step by step leading to progress in the war. (C) American bombing in Vietnam is a failure. (D) America must not give in to defeatism about the war in Vietnam.\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. \"Stop the bombing\" and \"Bombing the north has failed to halt or seriously check the flow of troops to the south\" indicate that the perspective of George McGovern's speech is that American bombing in Vietnam is a failure. The answer is (C).\n\nQ: This question refers to the following information.\n\"In the new Code of Laws which I suppose it will be necessary for you to make I desire you would Remember the Ladies, and be more generous and favorable to them than your ancestors. Do not put such unlimited power into the hands of the Husbands. Remember all Men would be tyrants if they could. If particular care and attention is not paid to the Ladies we are determined to foment a Rebellion, and will not hold ourselves bound by any Laws in which we have no voice, or Representation.\"\nAbigail Adams, in a letter to John Adams, 1776\n\"Special legislation for woman has placed us in a most anomalous position. Women invested with the rights of citizens in one section\u2014voters, jurors, office-holders\u2014crossing an imaginary line, are subjects in the next. In some States, a married woman may hold property and transact business in her own name; in others, her earnings belong to her husband. In some States, a woman may testify against her husband, sue and be sued in the courts; in others, she has no redress in case of damage to person, property, or character. In case of divorce on account of adultery in the husband, the innocent wife is held to possess no right to children or property, unless by special decree of the court.
But in no State of the Union has the wife the right to her own person, or to any part of the joint earnings of the co-partnership during the life of her husband. In some States women may enter the law schools and practice in the courts; in others they are forbidden. In some universities girls enjoy equal educational advantages with boys, while many of the proudest institutions in the land deny them admittance, though the sons of China, Japan and Africa are welcomed there. But the privileges already granted in the several States are by no means secure.\"\nSusan B. Anthony, \"Declaration of Rights for Women,\" July 4, 1876\nThe sentiments expressed in the second excerpt by Susan B. Anthony are most likely in support of\n(A) the Equal Rights Amendment (B) universal suffrage (C) states' rights (D) prohibition\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. The excerpt notes that women are in an anomalous position in terms of legislation: in some states women's earnings do not belong to them, or they cannot testify against their husbands. Anthony believed women should have the same legal rights as men. The answer is (B).\n\nQ: This question refers to the following information.\n\"Society in every state is a blessing, but government even in its best state is but a necessary evil; in its worst state an intolerable one; for when we suffer, or are exposed to the same miseries by a government, which we might expect in a country without government, our calamity is heightened by reflecting that we furnish the means by which we suffer. Government, like dress, is the badge of lost innocence; the palaces of kings are built on the ruins of the bowers of paradise. For were the impulses of conscience clear, uniform, and irresistibly obeyed, man would need no other lawgiver; but that not being the case, he finds it necessary to surrender up a part of his property to furnish means for the protection of the rest; and this he is induced to do by the same prudence which in every other case advises him out of two evils to choose the least. Wherefore, security being the true design and end of government, it unanswerably follows that whatever form thereof appears most likely to ensure it to us, with the least expense and greatest benefit, is preferable to all others.\"\nThomas Paine, Common Sense, 1776\nWhich of the following \"miseries\" alluded to above were most condemned by Anti-Federalists of the post-Revolutionary era?\n(A) Organized response to Bacon's Rebellion (B) Federal response to Shays's Rebellion (C) Federal response to the Whiskey Rebellion (D) Federal response to Pontiac's Rebellion\nA: Let's think step by step. We refer to Wikipedia articles on us history for help. Anti-Federalists distrusted centralized government power and condemned Washington's military response to the Whiskey Rebellion. Bacon's Rebellion and Pontiac's Rebellion happened before the Revolution, so they can be ruled out. The answer is (C).", "high_school_world_history": "The following are multiple choice questions (with answers) about high school world history.\n\nQ: This question refers to the following information.\n\"At least one of the [world's] societies would have to somehow enormously increase its productivity [in order to achieve global hegemony]. That quantum jump would have to be made before the various scientific, technological, agricultural, and industrial revolutions on which our post-quantum-leap world rests.
It could only be accomplished by exploiting the ecosystems, mineral resources, and human assets of whole continents outside the lands of the society making the jump. Western Europe did just that by means of its brutality and guns and, more important, by geographical and ecological luck.\"\nCopyright \u00a9 2015 Cambridge University Press.\nAlfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum jump\" mentioned in the passage most directly contributed to which of the following developments in the period 1450\u20131750 C.E.?\n(A) A breakdown in trade routes through the collapse of the established state structure (B) An increase in the population of the world through more plentiful supplies of food (C) The spread of Chinese and Indian belief systems across the world (D) An increase in social unrest\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The \"quantum jump\" mentioned in the passage refers to the conquest of the New World and the Columbian Exchange, which brought more plentiful supplies of food and thus an increase in the world's population. Choices (A) and (C) did not happen in this period, and choice (D) is not what the passage points to. The answer is (B).\n\nQ: This question refers to the following information.\n\"The struggle against neo-colonialism is not aimed at excluding the capital of the developed world from operating in less developed countries. It is aimed at preventing the financial power of the developed countries being used in such a way as to impoverish the less developed.\nNon-alignment, as practiced by Ghana and many other countries, is based on co-operation with all States whether they be capitalist, socialist or have a mixed economy. Such a policy, therefore, involves foreign investment from capitalist countries, but it must be invested in accordance with a national plan drawn up by the government of the non-aligned State with its own interests in mind. The issue is not what return the foreign investor receives on his investments\u2026The question is one of power. A State in the grip of neo-colonialism is not master of its own destiny.\"\nKwame Nkrumah, Neo-Colonialism, 1965\nWhich of the following provides the best context for Nkrumah's writings?\n(A) The Industrial Revolution (B) Decolonization (C) Regional Free Trade Associations (D) Autarky\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage expresses the concern that the fight against neo-colonialism could fail and that newly independent nations like Ghana might be re-colonized via the financial power of the developed countries, a concern rooted in the era of decolonization. The answer is (B).\n\nQ: This question refers to the following information.\n\"Indeed, as both the fatwas of distinguished [scholars] who base their opinion on reason and tradition alike and the consensus of the Sunni community agree that the ancient obligation of extirpation, extermination, and expulsion of evil innovation must be the aim of our exalted aspiration, for \"Religious zeal is a victory for the Faith of God the Beneficent\"; then, in accordance with the words of the Prophet (Peace upon him!)
\"Whosoever introduces evil innovation into our order must be expelled\" and \"Whosoever does aught against our order must be expelled,\" action has become necessary and exigent\u2026\"\nLetter from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514\nThe letter from Selim I is most clearly an example of which of the following?\n(A) The maintenance of military supremacy at all costs (B) Expanding tensions between religious sects (C) Factors that brought about the collapse of the Ottoman Empire (D) Peacemaking efforts among the Islamic empires\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage is an example of expanding tensions between Selim and Ismail. In the passage the Selim references the fatwa and the consensus of the Sunni community to against whosoever introduces evil. The answer is (B).\n\nQ: This question refers to the following information.\n\"The real grievance of the worker is the insecurity of his existence; he is not sure that he will always have work, he is not sure that he will always be healthy, and he foresees that he will one day be old and unfit to work. If he falls into poverty, even if only through a prolonged illness, he is then completely helpless, exam_ins to his own devices, and society does not currently recognize any real obligation towards him beyond the usual help for the poor, even if he has been working all the time ever so faithfully and diligently. The usual help for the poor, however, leaves a lot to be desired, especially in large cities, where it is very much worse than in the country.\"\nOtto von Bismarck, 1884\nOtto von Bismarck likely made this speech in reaction to which of the following issues?\n(A) Social acceptance of child labor (B) Declining life expectancy in Germany (C) Criticisms of German trade tariffs (D) Negative effects attributed to industrial capitalism\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. The passage talks about the grievance of the work under the industrial capitalism. The answer is (D).\n\nQ: This question refers to the following information.\nHe contains all works and desires and all perfumes and all tastes. He enfolds the whole universe and in silence is loving to all. This is the Spirit that is in my heart, this is Brahman. To him I shall come when I go beyond this life, and to him will come he who has faith and doubts not.\n\u2014The Upanishads, India, c. 1000 BCE\nTo which religion does the speaker most likely belong?\n(A) Hinduism (B) Buddhism (C) Shintoism (D) Zoroastrianism\nA: Let's think step by step. We refer to Wikipedia articles on world history for help. Brahman refers to the ultimate reality of all things in the Hindu religion. In contrast, Buddhism does not have a concept of supreme God. The answer is (A).", "human_aging": "The following are multiple choice questions (with answers) about human aging.\n\nQ: All other things being equal, which of the following persons is more likely to show osteoporosis?\n(A) An older Hispanic American woman (B) An older African American woman (C) An older Asian American woman (D) An older Native American woman\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. Although osteoporosis can occur at any age, the risk is higher for older people. It is most common in Asian and non-Hispanic white women. 
The answer is (C).\n\nQ: The finding that adults tend to remember events from their adolescence better than from other periods in their lives is referred to as the\n(A) Adolescence advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic retrieval spike\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. The reminiscence bump is the phenomenon that older adults tend to recollect events from their youth. People usually have a period of childhood amnesia from birth to around age 5, and a reminiscence bump between 10 and 30. The answer is (B).\n\nQ: Which element in tobacco smoke is responsible for cancers?\n(A) Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. The benzene, acrylamide and acrylonitrile in tar interact with the lungs, cause DNA mutations in cells of the lungs, and lead to cancer. The answer is (B).\n\nQ: When older adults move to a new state after retirement, which of the following is the more likely destination?\n(A) Texas (B) California (C) Hawaii (D) Vermont\nA: Let's think step by step. We refer to Wikipedia articles on human aging for help. Texas does not have a state income tax and has a low cost of living compared with the other three options. The answer is (A).", "human_sexuality": "The following are multiple choice questions (with answers) about human sexuality.\n\nQ: The most common disorder among men who seek sexual therapy is:\n(A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory disorder\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. The most common type of arousal disorder for men is erectile dysfunction, meaning that a person is not able to get any physical satisfaction from sexual activity although he may be interested in it. The answer is (C).\n\nQ: A woman who knows she has active herpes and untreated syphilis but continues to have sex without informing her partners of her condition has, in psychoanalytic terms:\n(A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. A person with a weak superego tends to be delinquent, criminal, or antisocial. The action of the woman who knows she has an active venereal disease but still has sex with her partners indicates she may have an antisocial personality. The answer is (B).\n\nQ: The nature of homosexual activities that occur during preadolescence include all but which of the following?\n(A) sexual intercourse (B) circle jerks (C) exhibitionism (D) touching each other's genitals\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. Sexual intercourse is prohibited during preadolescence. The answer is (A).\n\nQ: Women's ability to have multiple orgasms is primarily due to:\n(A) the fact that they do not have a refractory period. (B) the response of the inner layers of the vagina. (C) having alternating orgasms in different locations. (D) the G-Spot.\nA: Let's think step by step. We refer to Wikipedia articles on human sexuality for help. The refractory period is the time when a person is not able to be erect or is not interested in sex. The answer is (A).\n\nQ: Morning sickness is typically a problem:\n(A) during the first trimester (B) during the second trimester (C) during the third trimester (D) all through the pregnancy\nA: Let's think step by step.
We refer to Wikipedia articles on human sexuality for help. Morning sickness usually begins by nine weeks after conception, corresponding to the first trimester. The answer is (A).", "international_law": "The following are multiple choice questions (with answers) about international law.\n\nQ: How may the consent of a State to be bound be expressed?\n(A) The consent of a State to be bound is expressed only by ratification (B) The consent of a state to be bound by a treaty may be expressed by signature, ratification, acceptance, approval or accession (C) The consent of a State to be bound is expressed by signature (D) The consent of a State to be bound is expressed by whatever means they choose\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Article 11 of the Vienna Convention on the Law of Treaties, signed in 1969, states that \"the consent of a State to be bound by a treaty may be expressed by signature, exchange of instruments constituting a treaty, ratification, acceptance, approval or accession, or by any other means if so agreed.\" (B) is the most precise and accurate answer. The answer is (B).\n\nQ: What is the judge ad hoc?\n(A) If a party to a contentious case before the ICJ does not have a national sitting as judge, it is entitled to nominate someone as a judge solely for that case, with the title of judge ad hoc (B) Judge ad hoc is the member of the bench of the ICJ with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge is disqualified or passes away (D) Judge ad hoc is the judge that each party will always nominate in every contentious case\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. As \"ad hoc\" implies, a judge ad hoc is appointed only for a specific case or period, when a party to a contentious case before the International Court of Justice does not have a regular national sitting as judge. The answer is (A).\n\nQ: When can 'consent' serve as a circumstance precluding the wrongfulness of State conduct?\n(A) Consent can serve as a circumstance precluding the wrongfulness whenever it is given (B) Consent can never serve as a circumstance precluding wrongfulness (C) Consent can serve as a circumstance precluding wrongfulness, provided the consent is valid and to the extent that the conduct remains within the limits of the consent given (D) Consent can always serve as a circumstance precluding wrongfulness, no matter which organ of the State gives it\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Valid consent can serve as a circumstance precluding the wrongfulness of State conduct if the conduct remains within the limits of that consent, according to Chapter V of the Responsibility of States for Internationally Wrongful Acts, 2001, United Nations. The answer is (C).\n\nQ: Would a reservation to the definition of torture in the ICCPR be acceptable in contemporary practice?\n(A) This is an acceptable reservation if the reserving country's legislation employs a different definition (B) This is an unacceptable reservation because it contravenes the object and purpose of the ICCPR (C) This is an unacceptable reservation because the definition of torture in the ICCPR is consistent with customary international law (D) This is an acceptable reservation because under general international law States have the right to enter reservations to treaties\nA: Let's think step by step.
We refer to Wikipedia articles on international law for help. Because it contravenes the object and purpose of the ICCPR, this is an unacceptable reservation in contemporary practice. The answer is (B).\n\nQ: What types of force does Article 2(4) of the UN Charter prohibit?\n(A) Article 2(4) encompasses only armed force (B) Article 2(4) encompasses all types of force, including sanctions (C) Article 2(4) encompasses all interference in the domestic affairs of States (D) Article 2(4) encompasses force directed only against a State's territorial integrity\nA: Let's think step by step. We refer to Wikipedia articles on international law for help. Article 2(4) of the UN Charter prohibits states from using armed force in their international relations. The answer is (A).", "jurisprudence": "The following are multiple choice questions (with answers) about jurisprudence.\n\nQ: Iverson Jewelers wrote a letter to Miller, 'We have received an exceptionally fine self winding Rolox watch which we will sell to you at a very favorable price.'\n(A) The letter is an offer to sell (B) A valid offer cannot be made by letter. (C) The letter contains a valid offer which will terminate within a reasonable time. (D) The letter lacks one of the essential elements of an offer.\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. An offer shows the intent to enter into a mutually-beneficial contract with specific terms. An offer can be made by a letter. While this letter indicates the willingness to sell, the lack of specific terms, such as transaction price and offer expiration date, makes it an incomplete offer. The answer is (D).\n\nQ: Functions of the law include all but which of the following?\n(A) maximizing individual freedom (B) providing a basis for compromise (C) keeping the peace (D) promoting the principles of the free enterprise system\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. Laws are fundamentally about helping resolve disputes between individuals, and therefore essential for maximizing individual freedom, providing a basis for compromise, and keeping the peace. The answer is (D).\n\nQ: The ________ School of jurisprudence postulates that the law is based on what is \"correct.\"\n(A) Natural Law (B) Analytical (C) Historical (D) Sociological\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. The Natural Law School of jurisprudence focuses on the laws of nature and states that the law should be based on ethics, morals, and what is \"correct\". Analytical deals with the law as it already exists, Historical postulates that the law was found and not made, and Sociological studies how the law and society impact each other. The answer is (A).\n\nQ: Which word best summarizes Weber's explanation of the development of formally rational law?\n(A) Authority. (B) Charisma. (C) Co-operation. (D) Capitalism.\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. Weber explained the development of formal rationality in laws as how modern society moved from tradition to rationality, where people decide actions based less on how they were culturally done and more on expected utilities. How rational individuals optimize efficiency of accomplishing tasks for higher rewards is a core principle of Capitalism.
The answer is (D).\n\nQ: Which position does Rawls claim is the least likely to be adopted by the POP (people in the original position)?\n(A) The POP would choose equality above liberty. (B) The POP would opt for the 'maximin' strategy. (C) The POP would opt for the 'difference principle'. (D) The POP would reject the 'system of natural liberty.'\nA: Let's think step by step. We refer to Wikipedia articles on jurisprudence for help. The POP would opt for the 'maximin' strategy, opt for the 'difference principle', and reject the 'system of natural liberty', but the POP would not choose equality above liberty, since the POP assumes citizens are both free and equal. The answer is (A).", "logical_fallacies": "The following are multiple choice questions (with answers) about logical fallacies.\n\nQ: When an arguer causes confusion during refutation because of real or feigned lack of an ability to engage in refutation, that arguer may have committed the fallacy of\n(A) poor sportsmanship (B) appeal to compassion (C) argument against the person (D) ignorance of refutation\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. Ignorance of refutation, one of the fallacies on Aristotle's original list in his Organon, is when someone causes confusion in an argument through real or feigned inability to engage in refutation, in order to win the argument. The answer is (D).\n\nQ: The complex question fallacy consists of\n(A) arguing something is inferior just because it doesn't do something it was never intended to do. (B) including more than one claim in the proposition and treating proof for one claim as proof for all the claims. (C) drawing a conclusion before examining the evidence, and only considering evidence that supports that conclusion. (D) asking a question that includes either an unproven assumption or more than one question, thus making a straightforward yes or no answer meaningless.\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The complex question fallacy is when someone makes a single yes or no answer to a question meaningless, by including either an unproven assumption or many questions. The latter is also known as the many questions fallacy. The answer is (D).\n\nQ: Arguing that what is true of the parts must be true of the whole is the fallacy of...\n(A) Division (B) Composition (C) Appeal to the person (D) Appeal to ignorance\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The fallacy of composition occurs when someone argues that what is true of the parts must be true of the whole. The answer is (B).\n\nQ: Which of the following is true of a valid categorical syllogism?\n(A) The minor premise must deny the antecedent (B) The major premise must affirm the consequent (C) The middle term must be used in at least one premise in a universal or unqualified sense (D) All of the above\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. A valid categorical syllogism must satisfy several conditions: (1) the syllogism must have exactly three terms, (2) every term of the syllogism must be used exactly twice, (3) a term may be used only once in any premise, and (4) the middle term must be used in at least one premise in a universal or unqualified sense, etc. Only (C) is true.
The answer is (C).\n\nQ: If someone attacks the character of an opposing arguer, instead of responding to that opponent's arguments, the first person has probably committed which of the following fallacies?\n(A) tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi\nA: Let's think step by step. We refer to Wikipedia articles on logical fallacies for help. The argument against the person fallacy occurs when someone irrelevantly attacks the character of an opposing arguer, instead of addressing that opponent's arguments. The answer is (C).", "machine_learning": "The following are multiple choice questions (with answers) about machine learning.\n\nQ: Which image data augmentation is most common for natural images?\n(A) random crop and horizontal flip (B) random crop and vertical flip (C) posterization (D) dithering\nA: Let's think step by step. Data augmentation is used to increase the diversity of images in the training dataset. It is important that natural images are kept natural after being augmented. Vertical flips of images are not natural, so (B) is false. Posterization makes the image look like a poster, and dithering increases color depth. Neither of these preserves the natural property. The only natural data augmentation technique is (A). The answer is (A).\n\nQ: Traditionally, when we have a real-valued input attribute during decision-tree learning we consider a binary split according to whether the attribute is above or below some threshold. Pat suggests that instead we should just have a multiway split with one branch for each of the distinct values of the attribute. From the list below choose the single biggest problem with Pat\u2019s suggestion:\n(A) It is too computationally expensive. (B) It would probably result in a decision tree that scores badly on the training set and a test set. (C) It would probably result in a decision tree that scores well on the training set but badly on a test set. (D) It would probably result in a decision tree that scores well on a test set but badly on a training set.\nA: Let's think step by step. Because the input is real-valued, it is unlikely that the same values appear both at training and test time. This means that while such a decision tree could yield good performance on the training data, when evaluated on the test data it will perform badly because the decision tree won\u2019t know what to do with numbers that did not appear in the training data. The answer is (C).\n\nQ: You are reviewing papers for the World\u2019s Fanciest Machine Learning Conference, and you see submissions with the following claims. Which ones would you consider accepting?\n(A) My method achieves a training error lower than all previous methods! (B) My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise test error.) (C) My method achieves a test error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise cross-validation error.) (D) My method achieves a cross-validation error lower than all previous methods! (Footnote: When regularisation parameter \u03bb is chosen so as to minimise cross-validation error.)\nA: Let's think step by step. In machine learning, we train with some data and fixed hyperparameters and the training error can be arbitrarily low, so (A) can\u2019t be right.
Then one compares different hyperparameters by selecting the model with the lowest cross-validation error; this means that (B) and (D) are not the right procedure. The only relevant number after these is the test error and thus (C) is the right answer. The answer is (C).\n\nQ: A 6-sided die is rolled 15 times and the results are: side 1 comes up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times; side 5: 4 times; side 6: 5 times. Based on these results, what is the probability of side 3 coming up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16 (D) 1.0/5\nA: Let's think step by step. Add-1 smoothing adds the value of one to the different counts and then normalizes the probabilities accordingly. The counts after adding one will be: side 1 comes up 1 time; side 2: 2 times; side 3: 3 times; side 4: 4 times; side 5: 5 times; side 6: 6 times. The total count after adding one to each side is 21, so the probability of rolling a three is 3/21 = 1/7. The answer is (B).\n\nQ: To achieve a 0/1 loss estimate that is less than 1 percent of the true 0/1 loss (with probability 95%), according to Hoeffding's inequality the IID test set must have how many examples?\n(A) around 10 examples (B) around 100 examples (C) between 100 and 500 examples (D) more than 1000 examples\nA: Let's think step by step. By Hoeffding\u2019s inequality, we expect that with 95% probability the in-sample and out-of-sample errors differ by epsilon when we have N samples if 2 exp(-2 epsilon^2 N) < 0.05; this implies that N > -1/(2 epsilon^2) log(0.05/2) = log(40)*5000. Since log(40) > 1, we have that one needs more than 1000 examples. The answer is (D).", "management": "The following are multiple choice questions (with answers) about management.\n\nQ: How can organisational structures that are characterised by democratic and inclusive styles of management be described?\n(A) Hierarchical (B) Bureaucratic (C) Flat (D) Functional\nA: Let's think step by step. We refer to Wikipedia articles on management for help. Flat organizational structures are characterized by democratic and inclusive styles of management, and have few (if any) levels of management between the workers and managers. The answer is (C).\n\nQ: Hygiene factors are associated with which writer?\n(A) Frederick Hertzberg (B) D.C. McClelland (C) Abraham Maslow (D) Douglas McGregor\nA: Let's think step by step. We refer to Wikipedia articles on management for help. Hygiene factors include compensation, company policies, supervision, interpersonal relations, and work environments. Hertzberg lists them as factors that cannot motivate employees but can minimize job dissatisfaction. The answer is (A).\n\nQ: What characteristic is not a key feature of the 'open systems' model of management?\n(A) Morale (B) Innovation (C) Growth resource (D) Adaptation\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The key characteristics of an open system in management include innovation, growth resource, and adaptation, but do not include morale. The answer is (A).\n\nQ: Which element of the cultural web forms regalia?\n(A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The cultural web is a tool for mapping an organization's culture, where symbols form the regalia that visually expresses the values that the organization holds as important.
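The machine_learning rationales above rest on two small computations (Add-1 smoothing and Hoeffding's inequality). A minimal standalone Python sketch, included here purely as an editorial sanity check and not part of the prompt data or generated task YAMLs, reproduces both numbers:

```python
# Sanity check of the arithmetic stated in the machine_learning few-shot
# rationales above (standalone sketch; not part of the prompt data).
import math

# Add-1 (Laplace) smoothing over the observed die-roll counts.
counts = [0, 1, 2, 3, 4, 5]
smoothed = [c + 1 for c in counts]   # [1, 2, 3, 4, 5, 6]
total = sum(smoothed)                # 21
p_side3 = smoothed[2] / total        # 3/21 = 1/7, matching answer (B)
assert abs(p_side3 - 1 / 7) < 1e-12

# Hoeffding: 2*exp(-2*eps^2*N) < 0.05  =>  N > log(40) / (2*eps^2).
eps = 0.01                                         # 1 percent accuracy
n_required = math.log(2 / 0.05) / (2 * eps ** 2)   # ~18,445 examples
assert n_required > 1000                           # matching answer (D)
```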
The answer is (A).\n\nQ: What are the two main dimensions of the Ohio Studies into leadership?\n(A) Starting position and end position (B) Initial environment and changed environment (C) Organisational structure and conditioning (D) Initiating structure and considerations\nA: Let's think step by step. We refer to Wikipedia articles on management for help. The Ohio State Leadership Studies conducted in the 1940s identified initiating structure and consideration as the two main dimensions of leader behavior. The answer is (D).", "marketing": "The following are multiple choice questions (with answers) about marketing.\n\nQ: Although the content and quality can be as controlled as direct mail, response rates of this medium are lower because of the lack of a personal address mechanism. This media format is known as:\n(A) Care lines. (B) Direct mail. (C) Inserts. (D) Door to door.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Door to door marketing delivers non-addressed items within all buildings within a geographic area. While it can control the content and quality as well as direct mail marketing, its response rate is lower because of the lack of a personal address mechanism. The answer is (D).\n\nQ: In an organization, the group of people tasked with buying decisions is referred to as the _______________.\n(A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making unit.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. In an organization, the group of people tasked with buying decisions is referred to as the decision-making unit. The answer is (D).\n\nQ: The single group within society that is most vulnerable to reference group influence is:\n(A) The older consumer who feels somewhat left out of things. (B) The married women, many of whom feel a need for stability in their lives. (C) New immigrants who really want to assimilate into their new culture. (D) Children, who base most of their buying decisions on outside influences.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Children, who base most of their buying decisions on outside influences, are the single group within society most vulnerable to reference group influence. The answer is (D).\n\nQ: Which of the following is an assumption in Maslow's hierarchy of needs?\n(A) Needs are dependent on culture and also on social class. (B) Lower-level needs must be at least partially satisfied before higher needs can affect behaviour. (C) Needs are not prioritized or arranged in any particular order. (D) Satisfied needs are motivators, and new needs emerge when current needs remain unmet.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. The levels of Maslow's hierarchy of needs, from the bottom upwards, are physiological (food and clothing), safety, love and belonging, esteem, and self-actualization. Lower-level needs must be at least partially satisfied before higher ones can affect behavior. The answer is (B).\n\nQ: _____________ is a natural outcome when combining demographic and geographic variables.\n(A) Geodemographics (B) Product differentiation. (C) ANSOFF matrix. (D) Brand management.\nA: Let's think step by step. We refer to Wikipedia articles on marketing for help. Geodemographics is a natural outcome when combining demographic and geographic variables.
The answer is (A).", "medical_genetics": "The following are multiple choice questions (with answers) about medical genetics.\n\nQ: The stage of meiosis in which chromosomes pair and cross over is:\n(A) prophase I (B) metaphase I (C) prophase II (D) metaphase II\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Prophase I is the stage of meiosis where homologous chromosomes pair with each other and exchange genetic material. The answer is (A).\n\nQ: DNA ligase is\n(A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial origin which cuts DNA at defined base sequences (C) an enzyme that facilitates transcription of specific genes (D) an enzyme which limits the level to which a particular nutrient reaches\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. DNA ligase is a type of enzyme (EC 6.5.1.1) responsible for joining DNA strands together by catalyzing a phosphodiester bond. The answer is (A).\n\nQ: Which of the following conditions does not show multifactorial inheritance?\n(A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural tube defects) (D) Marfan syndrome\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Multifactorial inheritance is when more than a single factor is responsible for causing a given trait or health problem. Genes cannot be the only factor. Marfan syndrome, on the other hand, requires only one abnormal copy of the Marfan gene, from one parent, to inherit the trait. The answer is (D).\n\nQ: A gene showing codominance\n(A) has both alleles independently expressed in the heterozygote (B) has one allele dominant to the other (C) has alleles tightly linked on the same chromosome (D) has alleles expressed at the same time in development\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. Codominance, as it relates to genetics, refers to a type of genetic inheritance where the phenotype of both the parents is easily observed in the offspring. A heterozygote is an individual having two different alleles of a gene. The answer is (A).\n\nQ: Large triplet repeat expansions can be detected by:\n(A) polymerase chain reaction. (B) single strand conformational polymorphism analysis. (C) Southern blotting. (D) Western blotting.\nA: Let's think step by step. We refer to Wikipedia articles on medical genetics for help. A Southern blot is a method in molecular biology for detecting specific DNA sequences in a sample. Large triplet repeat expansions are usually detected with this method. The answer is (C).", "miscellaneous": "The following are multiple choice questions (with answers) about miscellaneous.\n\nQ: Which of these songs was a Top 10 hit for the rock band The Police?\n(A) 'Radio Ga-Ga' (B) 'Ob-la-di Ob-la-da' (C) 'De Do Do Do De Da Da Da' (D) 'In-a-Gadda-Da-Vida'\nA: Let's think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by Queen. Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly. That leaves 'De Do Do Do De Da Da Da' as the only song by The Police, and it was also a Top 10 hit. The answer is (C).\n\nQ: What place is named in the title of the 1979 live album by rock legends Cheap Trick?\n(A) Budapest (B) Budokan (C) Bhutan (D) Britain\nA: Let's think step by step. We refer to Wikipedia for help. Nippon Budokan is an indoor arena in Tokyo, Japan, renowned for hosting rock music concerts including Cheap Trick in 1978.
'Cheap Trick at Budokan' became the name of their album. The answer is (B).\n\nQ: What is produced during photosynthesis?\n(A) hydrogen (B) nylon (C) oxygen (D) light\nA: Let's think step by step. We refer to Wikipedia for help. Photosynthesis is the process in which green plants use the green pigment chlorophyll to synthesize foods with water and carbon dioxide. Oxygen is the byproduct of this process. The answer is (C).\n\nQ: Who is the shortest man to ever win an NBA slam dunk competition?\n(A) Anthony 'Spud' Webb (B) Michael 'Air' Jordan (C) Tyrone 'Muggsy' Bogues (D) Julius 'Dr J' Erving\nA: Let's think step by step. We refer to Wikipedia for help. In 1986, Spud Webb, standing only 5'7\", became the shortest NBA player in history to win an official slam dunk contest. The answer is (A).\n\nQ: How many axles does a standard automobile have?\n(A) one (B) two (C) four (D) eight\nA: Let's think step by step. We refer to Wikipedia for help. Most cars have two axles to rotate the wheels. The answer is (B).", "moral_disputes": "The following are multiple choice questions (with answers) about moral disputes.\n\nQ: Baron admits that the versions of the ticking bomb hypothetical she discusses are \"stunningly stupid,\" but she claims this is actually evidence of\n(A) the stupidity of most traditional philosophical examples. (B) a general lack of intelligence among people with advanced degrees. (C) the wrongness of torture. (D) the readiness on the part of many intelligent people to see torture as the best solution to deal with terrorism.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. The ticking bomb hypothetical poses a problem where many people will die from an exploding bomb if the hypothetical terrorist does not disclose how to defuse it. Baron sees this hypothetical as silly, but its prevalence does suggest intelligent people, particularly utilitarians, see torture as justifiable to save the lives in this scenario. The answer is (D).\n\nQ: A fertilized ovum is also known as\n(A) a zygote. (B) an embryo. (C) a viability. (D) a blastocyst.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. Once a single sperm penetrates the layers of an egg to form a new cell, that cell is called a zygote. The answer is (A).\n\nQ: Pence compares six different cases of reproduction, from natural twinning to SCNT. What conclusion does he draw from this comparison?\n(A) SCNT is not a different kind of reproduction because there are no morally relevant differences between it and other permissible means of reproduction. (B) Because there is a low risk of harm for natural twinning, there will be a low risk of harm for SCNT. (C) both A and B (D) neither A nor B\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. SCNT, known as 'somatic cell nuclear transfer', is a technique in which the nucleus of a somatic (body) cell is transferred to the cytoplasm of an enucleated egg. This is used in farm animal cloning. Pence may compare this to other processes of reproduction, and see little moral separation. The answer is (A).\n\nQ: Which of the following is an intrinsic good, according to Singer?\n(A) being in the upper class (B) the ability to learn and solve problems (C) being more beautiful than most of the population (D) being shorter than average\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help.
Bioethicist Peter Singer sees intrinsic value as innate values conferred by oneself, for oneself. Inanimate objects can be beautiful, short, or have some valuable criteria, but capabilities are intrinsically good. The answer is (B).\n\nQ: According to Metz, what is wrong with consequentialist arguments against capital punishment based on African values?\n(A) It is unclear as of yet whether or not capital punishment deters harm to the community. (B) It is unclear as of yet whether or not capital punishment deters harm to any individuals. (C) Consequentialism is not supported by African values. (D) Even though consequentialism is supported by African values, no consequentialist arguments framed in terms of African values have been offered.\nA: Let's think step by step. We refer to Wikipedia articles on moral disputes for help. Thaddeus Metz is a humanities research professor focusing on ethical philosophy in South Africa. Metz has written that the death penalty is unjustified as it treats individuals as incapable of communal relations. It is unclear whether capital punishment benefits, or deters harm to, the community. The answer is (A).", "moral_scenarios": "The following are multiple choice questions (with answers) about moral scenarios.\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I took his wallet after he harassed me at the bar. Scenario 2 | I made love to my newborn baby boy.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Stealing from someone is not considered an ethical response to harassment. And having sex with a minor is egregiously unethical. The answer is (A).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I asked for a cigarette from the guy outside the store. Scenario 2 | When the security guard left his post, I pocketed a rare coin.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Asking for something is not morally wrong. Stealing is morally wrong. The answer is (C).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | During the house's demolition, I pushed him into a wall. Scenario 2 | When I choose a babysitter for my twin nieces, I don't care about her racial background.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Physical violence is morally wrong. Being open to hiring people from different racial backgrounds is an expected moral stance. The answer is (B).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I bought an old pick-up truck for work.
Scenario 2 | I looked up at the gorgeous waterfall and threw a rock at it.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Neither of these is wrong, as no one is wronged by buying a pick-up truck or throwing a rock at a waterfall. The answer is (D).\n\nQ: For which of these two scenarios does the main character (who uses I/me/my) do something clearly morally wrong, according to ordinary moral standards in the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new girl is embarrassed to live in a trailer park, so I invite the class to her house.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral scenarios for help. Loving someone is not wrong. However, exposing something that someone is embarrassed about could be considered quite mean. The answer is (C).", "nutrition": "The following are multiple choice questions (with answers) about nutrition.\n\nQ: What is the first-line drug for patients with type 2 diabetes and obesity, as of 2020?\n(A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. Metformin (Fortamet, Glumetza, or others) is usually the first medication prescribed for type 2 diabetes, as well as obesity. It works by lowering glucose production in the liver and improving the body's sensitivity to insulin. The answer is (B).\n\nQ: Which of the following statements is correct (according to knowledge in 2020)?\n(A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame (B) Consumers with phenylketonuria must avoid the consumption of the sweetener saccharin (C) Consumers with phenylketonuria must avoid the consumption of the sweetener sucralose (D) Consumers with phenylketonuria must avoid the consumption of the sweetener acesulfame K\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. People with phenylketonuria (PKU) cannot break down the amino acid phenylalanine. As it builds up in the blood and brain it can lead to brain damage. People with PKU should avoid foods that are converted to phenylalanine in the body, such as aspartame. The answer is (A).\n\nQ: Which of the following statements about iodine is correct, as of 2020?\n(A) 50% of adults consume iodine at levels below the RNI (B) Dairy products are a poor source of iodine (C) The iodine content of organic milk is generally lower than the level in non-organic milk (D) UK dietary reference values recommend an increase in iodine intake in pregnancy\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. Organic milk usually has a lower iodine content than non-organic milk. The answer is (C).\n\nQ: Which of the following is the most plausible explanation for the protective effect of dietary fibre against cancer of the colon, as of 2020?\n(A) Propionic acid, formed during colonic fibre fermentation inhibits liver fatty acid synthesis (B) Butyric acid, formed during colonic fibre fermentation stimulates \"silencing\" of the SLC5A8 tumour suppressor gene (C) None of these options are correct (D) Butyric acid, formed during colonic fibre fermentation stimulates anti-oxidant defences in the colon\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help.
Dietary fibre intake is inversely proportional to the risk of colorectal cancer. This is presumed to be because butyric acid (BA) stimulates antioxidant defences that help protect the colon from cancerous tumors. The answer is (D).\n\nQ: In a cohort study, the risk ratio of developing diabetes was 0.86 when comparing consumers of tea (the exposed) to those who did not drink tea (the unexposed). Which one statement is correct (according to knowledge in 2020)?\n(A) The tea drinkers have lower risk of developing diabetes. (B) The tea drinkers have higher risk of developing diabetes. (C) Based on the information given we cannot tell if the observed difference in disease risk is the result of chance. (D) The risk ratio is close to the value one, so there is no difference in disease risk between the two groups.\nA: Let's think step by step. We refer to Wikipedia articles on nutrition for help. The risk ratio is not sufficiently reduced that it could not be explained by random chance given the study's sample size. The answer is (C).", "philosophy": "The following are multiple choice questions (with answers) about philosophy.\n\nQ: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n(A) metaphysics (B) epistemology (C) quantum physics (D) axiology\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Among the options, only metaphysics studies the nature of reality and existence. The answer is (A).\n\nQ: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\n(A) pleasure. (B) happiness. (C) good. (D) virtue.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Moore's \"ideal utilitarianism\" states that one's actions should maximize intrinsic goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n(A) optimist (B) satisfied (C) nominally religious (D) pessimist\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Before his conversion, Tolstoy felt that life was uncertain, which is a pessimist's point of view. The answer is (D).\n\nQ: According to d'Holbach, people always act according to _____.\n(A) free choices (B) dictates of the soul (C) necessary natural laws (D) undetermined will\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. d'Holbach believes that people act according to necessary laws, and it proves nothing about people's free will. The answer is (C).\n\nQ: Psychological egoism is:\n(A) an ethical theory about how we ought to behave. (B) a generalization concerning the way people tend to behave. (C) a claim about human nature and the ways people are capable of behaving. (D) none of the above.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy for help. Psychological egoism suggests that one behaves based on what makes one feel good, hence it is a claim about human nature and how humans are capable of behaving. The answer is (C).", "prehistory": "The following are multiple choice questions (with answers) about prehistory.\n\nQ: What is the approximate mean cranial capacity of Homo erectus?\n(A) under 650 cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. The average cranial capacity of Homo erectus is less than 1000 cubic cm.
The answer is (C).\n\nQ: According to Timothy Pauketat, the evidence for social stratification and political power at Cahokia suggests:\n(A) a center of Mississippian civilization with conditions similar to the rise of early states. (B) the limitations of authority in a Native American society of egalitarian foragers. (C) a simple chiefdom or perhaps a complex chiefdom had evolved by A.D. 1500. (D) a center of Mississippian civilization with conditions similar to societies on the Northwest Coast of North America.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian culture, where he found conditions similar to the rise of early states. The answer is (A).\n\nQ: Recent research on hominid species dating from the Middle Pliocene indicates there was (as of 2020):\n(A) a great amount of species diversity, or a single species that exhibited a lot of diversity. (B) very little species diversity during this period and very few hominids. (C) decreased species diversity due to a prolonged ice age followed by a severe drought. (D) decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Recent research has recognized multiple hominid species from the Middle Pliocene, meaning that there is a great amount of species diversity or diversity in a single species. The answer is (A).\n\nQ: Researchers now believe that the decline of the Maya was caused chiefly by:\n(A) a cataclysm of some kind, such as an earthquake, volcano, or tsunami. (B) ecological degradation resulting from slash-and-burn farming techniques. (C) endless wars between neighboring Mayan city-states. (D) practices of interbreeding that led to a steep rise in congenital disorders.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Researchers believe that the Maya collapse was mainly caused by over-exploitation of natural resources through slash-and-burn farming techniques. The answer is (B).\n\nQ: The great Mayan king Pacal built temples in the city of Palenque in order to:\n(A) satisfy the powerful Mayan astronomer priests. (B) display his generosity to the common people, since they were allowed to live in the temples. (C) frighten away enemies, in particular the Spaniards. (D) legitimize his kingship, since his father was not royal.\nA: Let's think step by step. We refer to Wikipedia articles on prehistory for help. Pacal built the temples as a funerary monument to legitimize his kingship. The answer is (D).", "professional_accounting": "The following are multiple choice questions (with answers) about professional accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer\u2019s subledger. Which of the following management assertions is supported by this test?\n(A) Valuation and allocation (B) Completeness (C) Rights and obligations (D) Presentation and disclosure\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The completeness assertion is tested by tracing supporting documents to the record entries. The answer is (B).\n\nQ: One hundred years ago, your great-great-grandmother invested $100 at 5% yearly interest. What is the investment worth today?\n(A) $13,000 (B) $600 (C) $15,000 (D) $28,000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help.
A $100 investment at 5% yearly interest is worth 100*(1.05)^100 = $13,150 after 100 years, which is around $13,000. The answer is (A).\n\nQ: On January 1, year 1, Alpha Co. signed an annual maintenance agreement with a software provider for $15,000 and the maintenance period begins on March 1, year 1. Alpha also incurred $5,000 of costs on January 1, year 1, related to software modification requests that will increase the functionality of the software. Alpha depreciates and amortizes its computer and software assets over five years using the straight-line method. What amount is the total expense that Alpha should recognize related to the maintenance agreement and the software modifications for the year ended December 31, year 1?\n(A) $5,000 (B) $13,500 (C) $16,000 (D) $20,000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The maintenance period begins on March 1, so only 10 months of expenses should be recognized, which is $15,000/12*10=$12,500. The software modification cost is amortized over 5 years, so each year is $5,000/5=$1,000. So the total expense is $12,500+$1,000=$13,500. The answer is (B).\n\nQ: Krete is an unmarried taxpayer with income exclusively from wages. By December 31, year 1, Krete's employer has withheld $16,000 in federal income taxes and Krete has made no estimated tax payments. On April 15, year 2, Krete timely filed for an extension request to file her individual tax return, and paid $300 of additional taxes. Krete's year 1 tax liability was $16,500 when she timely filed her return on April 30, year 2, and paid the remaining tax liability balance. What amount would be subject to the penalty for underpayment of estimated taxes?\n(A) $0 (B) $500 (C) $1,650 (D) $16,500\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. The tax due after withholding is $16,500-$16,000=$500, which is less than $1,000, hence there is no penalty for underpayment of estimated taxes. The answer is (A).\n\nQ: Box, a nongovernmental not-for-profit organization, had the following transactions during the year: Proceeds from sale of investments $80000 Purchase of property plant and equipment $10000 Proceeds from long-term debt $100000 Loss on sale of investment $5000 What amount should be reported as net cash provided by financing activities in Box's statement of cash flows?\n(A) $70,000 (B) $75,000 (C) $80,000 (D) 100000\nA: Let's think step by step. We refer to Wikipedia articles on accounting for help. Among the four transactions, only the proceeds from long-term debt belong to the financing activities section of the statement of cash flows, hence the amount reported should be $100,000. The answer is (D).", "professional_law": "The following are multiple choice questions (with answers) about professional law.\n\nQ: A son owed a creditor $5,000. The son's father contacted the creditor and told him that he wanted to pay the son's debt. The father signed a document that stated the father would pay the son's debt at a rate of $500 a month for 10 months. The creditor made no written or oral commitment to forbear to sue the son to collect the $5,000 debt, and the father made no oral or written request for any such forbearance. For the next five months, the father made and the creditor accepted the $500 monthly payments as agreed. During that period, the creditor, in fact, did forbear to take any legal action against the son. However, the father then informed the creditor that he would make no further payments on the debt.
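The professional_accounting rationales above likewise reduce to short calculations. The same kind of minimal standalone Python sketch, again an editorial sanity check rather than part of the prompt data, confirms them:

```python
# Sanity check of the professional_accounting arithmetic above
# (standalone sketch; not part of the prompt data).
future_value = 100 * 1.05 ** 100   # ~13,150 -> "around $13,000", answer (A)
assert 13_000 < future_value < 13_500

maintenance = 15_000 / 12 * 10     # 10 months recognized -> $12,500
amortization = 5_000 / 5           # straight-line over 5 years -> $1,000
assert maintenance + amortization == 13_500   # answer (B)

underpayment = 16_500 - 16_000     # $500, below $1,000 -> no penalty, answer (A)
assert underpayment < 1_000
```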
Which of the following is the most persuasive argument that the father is liable to the creditor under the terms of their agreement?\n(A) The father's promise and the creditor's reliance thereon, if proved, gave rise to a valid claim by the creditor against the father based on the doctrine of promissory estoppel. (B) Because it was foreseeable that the father's promise would induce the creditor to forbear taking any action against the son, such forbearance was, as a matter of law, a bargained-for consideration for the father's promise. (C) The father's five payments to the creditor totaling $2,500 manifested a serious intent on the father's part to be contractually bound, and such manifestation is generally recognized as an effective substitute for consideration. (D) By assuming the antecedent debt obligation that the son owed to the creditor, the father became a surety whose promise to the creditor was enforceable, since it was in writing and supported by adequate consideration. \nA: Let's think step by step. We refer to Wikipedia articles on law for help. The doctrine of promissory estoppel stops a person from going back on a promise in contract law, hence option (A) should be the most persuasive argument. The answer is (A).\n\nQ: A state has recently enacted a statute prohibiting the disposal of any nuclear wastes within the state. This law does not contravene or conflict with any federal statutes. A man operates a company in the state that is engaged in the disposal of nuclear wastes. Subsequent to the passage of the state statute, the man, not yet aware of the new law, entered into contracts with many out-of-state firms to dispose of their nuclear wastes in the state. On account of this new law, however, the man will be unable to perform these contracts. Assume that the man has standing to challenge this state law. Which of the following presents his strongest constitutional grounds to challenge the state law prohibiting the disposal of nuclear wastes within the state?\n(A) The commerce clause. (B) The equal protection clause of the Fourteenth Amendment. (C) The privileges and immunities clause of Article IV, Section 2. (D) The contract clause.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. The commerce clause states that Congress shall have the power to regulate commerce with foreign Nations, and among the several States, and with the Indian Tribes. The state statute affects interstate commerce, which brings it within the clause's scope. Hence the man's strongest argument should be the commerce clause. The answer is (A).\n\nQ: On October 1, 1980, a developer, owner of several hundred acres in a rural county, drafted a general development plan for the area. The duly recorded plan imposed elaborate limitations and restrictions upon the land in the plan, which was to be developed as a residential district. The restrictions were to extend to all persons acquiring any of the lots and to their heirs, assigns, and lessees. It was further provided that all subsequent owners would be charged with due notice of the restrictions. Among those restrictions in the general plan were the following: (22) A franchise right is created in a strip of land 10 feet in width along the rear of each lot for the use of public utility companies with right of ingress and egress. (23) No house or structure of any kind shall be built on the aforementioned strip of land running through the said blocks.
In 2000, a retiree purchased one of the lots, built a house, and erected a fence in the rear of his property within the restricted area. In 2004, a teacher purchased a lot adjacent to the retiree's property and built a new house. Two years later, a librarian purchased the lot that adjoined the teacher's property. The three deeds to those properties each contained references to the deed book where the general plan was recorded. In 2008, the librarian began the construction of a seven-foot post-and-rail fence along the line dividing his lot from the teacher's, and along the center of the area subject to the franchise right. Although the teacher objected to its construction, the fence was completed. If the teacher seeks a mandatory injunction to compel removal of the librarian's fence, the court will most likely\n(A) grant relief, because the fence was in violation of the easement restriction. (B) grant relief, because the encroachment of the fence violated the restriction in the original plan. (C) deny relief, because the teacher failed to enforce the restriction against the retiree. (D) deny relief, because the fence would not be construed as \"a structure\" within the terms of the restriction. \nA: Let's think step by step. We refer to Wikipedia articles on law for help. The restrictions in the original plan say no house or structure of any kind shall be built on the aforementioned strip of land running through the said blocks. Hence the court will most likely grant relief because the fence violated the restriction in the original plan. The answer is (B).\n\nQ: A judge took judicial notice of some facts at the beginning of the trial. Which of the following is not an appropriate kind of fact for judicial notice?\n(A) Indisputable facts. (B) Facts that have been asserted by individual political organizations. (C) Facts recognized to be true by common knowledge. (D) Facts capable of scientific verification.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. Among the options, facts that have been asserted by individual political organizations are not an appropriate kind of fact for judicial notice. The answer is (B).\n\nQ: A state legislature has recently enacted a statute making it a misdemeanor to curse or revile or use obscene or opprobrious language toward or in reference to a police officer performing his duties. A student at a state university organized a demonstration on campus to protest the war. The rally was attended by a group of 50 students who shouted anti-war messages at cars passing by. To show his contempt for the United States, the student sewed the American flag to the rear of his jeans. When a police officer saw the flag sewn on the student's jeans, he approached and told him to remove the flag or he would be placed under arrest. The student became angered and shouted at the police officer, \"Listen, you bastard, I'll wear this rag anywhere I please. \" The student was subsequently placed under arrest and charged with violating the state statute. The student subsequently brings suit in state court challenging the constitutionality of the statute. The strongest constitutional argument for the student is that\n(A) the statute is void for vagueness under the Fourteenth Amendment's due process clause. (B) the statute is invalid because it violates the petitioner's freedom of speech under the First Amendment.
(C) the statute is an abridgment of freedom of speech under the First Amendment because less restrictive means are available for achieving the same purpose. (D) the statute is overbroad and consequently invalid under the First and Fourteenth Amendments.\nA: Let's think step by step. We refer to Wikipedia articles on law for help. The Fourteenth Amendment further supports the First Amendment by establishing a due process clause. Hence the strongest argument should be that the statute is overbroad and consequently invalid under the First and Fourteenth Amendments. The answer is (D).", "professional_medicine": "The following are multiple choice questions (with answers) about professional medicine.\n\nQ: A 22-year-old male marathon runner presents to the office with the complaint of right-sided rib pain when he runs long distances. Physical examination reveals normal heart and lung findings and an exhalation dysfunction at ribs\u00a04-5 on the right. Which of the following muscles or muscle groups will be most useful in correcting this dysfunction utilizing a direct method?\n(A) anterior scalene (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. Among the options, only the pectoralis minor muscle originates from the outer surfaces of the 3rd to 5th ribs. The answer is (C).\n\nQ: A 36-year-old male presents to the office with a\u00a03-week\u00a0history of low back pain. He denies any recent trauma but says that he climbs in and out of his truck numerous times a day for his job. Examination of the patient in the prone position reveals a deep sacral sulcus on the left, a posterior inferior lateral angle on the right, and a lumbosacral junction that springs freely on compression. The most likely diagnosis is\n(A) left-on-left sacral torsion (B) left-on-right sacral torsion (C) right unilateral sacral flexion (D) right-on-right sacral torsion\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The deep sulcus on the left, a posterior ILA on the right, with a negative spring test suggests a right-on-right sacral torsion. All other options have a deep sulcus on the right. The answer is (D).\n\nQ: A 44-year-old man comes to the office because of a 3-day history of sore throat, nonproductive cough, runny nose, and frontal headache. He says the headache is worse in the morning and ibuprofen does provide some relief. He has not had shortness of breath. Medical history is unremarkable. He takes no medications other than the ibuprofen for pain. Vital signs are temperature 37.4\u00b0C (99.4\u00b0F), pulse 88/min, respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the nares shows erythematous mucous membranes. Examination of the throat shows erythema and follicular lymphoid hyperplasia on the posterior oropharynx. There is no palpable cervical adenopathy. Lungs are clear to auscultation. Which of the following is the most likely cause of this patient's symptoms?\n(A) Allergic rhinitis (B) Epstein-Barr virus (C) Mycoplasma pneumonia (D) Rhinovirus\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The symptoms, especially the headache, suggest that the most likely cause is Rhinovirus. Epstein-Barr virus would cause swollen lymph nodes, but there is no palpable cervical adenopathy. That the lungs are clear to auscultation suggests it is not Mycoplasma pneumonia.
The answer is (D).\n\nQ: A previously healthy 32-year-old woman comes to the physician 8 months after her husband was killed in a car crash. Since that time, she has had a decreased appetite and difficulty falling asleep. She states that she is often sad and cries frequently. She has been rechecking the door lock five times before leaving her house and has to count exactly five pieces of toilet paper before she uses it. She says that she has always been a perfectionist but these urges and rituals are new. Pharmacotherapy should be targeted to which of the following neurotransmitters?\n(A) Dopamine (B) Glutamate (C) Norepinephrine (D) Serotonin\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The patient feels sad and among the options, only Dopamine and Serotonin can help increase positive emotions. Serotonin also affects digestion and metabolism, which can help the patient's decreased appetite and sleep difficulty. The answer is (D).\n\nQ: A 42-year-old man comes to the office for preoperative evaluation prior to undergoing adrenalectomy scheduled in 2 weeks. One month ago, he received care in the emergency department for pain over his right flank following a motor vehicle collision. At that time, blood pressure was 160/100 mm Hg and CT scan of the abdomen showed an incidental 10-cm left adrenal mass. Results of laboratory studies, including complete blood count, serum electrolyte concentrations, and liver function tests, were within the reference ranges. The patient otherwise had been healthy and had never been told that he had elevated blood pressure. He takes no medications. A follow-up visit in the office 2 weeks ago disclosed elevated urinary normetanephrine and metanephrine and plasma aldosterone concentrations. The patient was referred to a surgeon, who recommended the adrenalectomy. Today, vital signs are temperature 36.6\u00b0C (97.9\u00b0F), pulse 100/min, respirations 14/min, and blood pressure 170/95 mm Hg. Physical examination discloses no significant findings. Initial preoperative preparation should include treatment with which of the following?\n(A) Labetalol (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\nA: Let's think step by step. We refer to Wikipedia articles on medicine for help. The symptoms and the adrenal mass suggested pheochromocytoma, and the blood pressure indicates hypertension. Phenoxybenzamine is used to treat hypertension caused by pheochromocytoma. The answer is (D).", "professional_psychology": "The following are multiple choice questions (with answers) about professional psychology.\n\nQ: In the construction of a multiple regression equation for purposes of prediction, the optimal combination of measures is one in which the predictors\n(A) are uncorrelated with each other but are moderately correlated with the criterion (B) have low correlations with each other and low correlations with the criterion (C) are highly intercorrelated with each other and moderately correlated with the criterion (D) have low correlations with the criterion bur are moderately correlated with each other\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The basis of multiple regression is to assess the relationship between one continuous variable and a set of independent variables. So the predictors should be uncorrelated with each other but are moderately correlated with the criterion. 
The answer is (A).\n\nQ: There are three ways to measure the Central Tendency: the Mean, the Median and the Mode. From your knowledge about them, what is the mode?\n(A) less sensitive to extreme scores than the mean (B) more useful for skewed distributions (C) sensitive to extreme values and highly skewed distributions (D) the most frequently occurring number\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. The definition of mode is the most frequently occurring number. The answer is (D).\n\nQ: Carl Jung believed that a client's transference:\n(A) is a fantasy that distracts the client from reality. (B) represents \u201cmixed feelings\u201d toward the therapist. (C) \"is a form of \"\"acting out.\"\"\" (D) reflects the client\u2019s personal and collective unconscious.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Transference is a phenomenon that a person's feelings are unconsciously redirected, so it reflects the client's personal and collective unconscious. The answer is (D).\n\nQ: In terms of Hofstede\u2019s (1980) five cultural dimensions, the United States scores at the top of the scale on:\n(A) individualism. (B) individualism and power distance. (C) power distance and masculinity. (D) uncertainty avoidance.\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. US scores highest on individualism among the five cultural dimensions. The answer is (A).\n\nQ: One of your therapy clients asks your advice about a good weight- reduction program. You have investigated the programs in the community and are enrolled in the one you consider the best. This program offers a $50 bonus to its patrons for each new person they bring into the program. Under these circumstances, your most appropriate response would be to\n(A) tell your client the pros and cons of each program you know about except for the one in which you are enrolled (B) recommend to your client the program in which you are enrolled and explain the $50 bonus you will receive (C) recommend to your client the program in which you are enrolled and offer to have the $50 bonus credited to your client's account in the program (D) tell your client the pros and cons of each program you know about, but do not claim the $50 bonus if your client enrolls in your program\nA: Let's think step by step. We refer to Wikipedia articles on psychology for help. Based on the circumstances, you should tell your client about the pros and cons of each program, but it would be inappropriate to receive the bonus, so you should not claim the $50 bonus. The answer is (D).", "public_relations": "The following are multiple choice questions (with answers) about public relations.\n\nQ: Earth Hour was a campaign launched by which organization?\n(A) Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. Earth Hour is a worldwide movement oragnized launched by the World Wildlife Fund. The answer is (D).\n\nQ: In issues management, what is the most proactive approach to addressing negative or misleading information posted online about your organization?\n(A) Buy domain names that could be used by opposition groups. (B) Post anonymous comments on blogs to combat this information. (C) Prepare a news release that discredits the inaccurate information. (D) Make policy changes to address complaints highlighted on these sites.\nA: Let's think step by step. 
We refer to Wikipedia articles on public relations for help. In issues management, the most proactive approach to addressing negative or misleading information posted online is to make policy changes to address complaints highlighted on those sites. The answer is (D).\n\nQ: At which stage in the planning process would a situation analysis be carried out?\n(A) Defining the program (B) Planning the program (C) Taking action and implementing ideas (D) Evaluation of the program\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. Situation analyses are typically carried out during the planning process stage of defining the program. The answer is (A).\n\nQ: Which of these statements is true of the Vatican in 2010 at the time of the accusations of child abuse cover-ups?\n(A) There was a coordinated media response. (B) Consistent messages were communicated. (C) Criticisms were taken as attacks on the Catholic Church. (D) The credibility of the Vatican was upheld.\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. In 2010 when there were accusations of child abuse cover-ups, the Vatican took those criticisms as attacks on the Catholic Church. The answer is (C).\n\nQ: What should a public relations media practitioner do if she does not know the answer to a reporter's question?\n(A) Give the reporter other information she is certain is correct. (B) Say that the information is 'off the record' and will be disseminated later. (C) Say 'I don't know' and promise to provide the information later. (D) Say 'no comment,' rather than appear uninformed.\nA: Let's think step by step. We refer to Wikipedia articles on public relations for help. If a public relations media practitioner does not know the answer to a reporter's question, they should say 'I don't know' and offer to provide the information later. The answer is (C).", "security_studies": "The following are multiple choice questions (with answers) about security studies.\n\nQ: What are the frameworks of analysis within which terrorism has been considered (as of 2020)?\n(A) Competition between larger nations has resulted in some countries actively supporting terrorist groups to undermine the strength of rival states. Terrorist networks are extended patronage clubs maintained and paid for by their donor states and are conceptualised as being like state actors, to be dealt with using military force. (B) Globalization has enabled the internationalization of terrorist activities by opening up their operational space, although coordination is still managed from a geographical base. This suggests that terrorist groups are nationally structured which means that terrorism cannot be considered in terms of a war to be defeated militarily without having serious implications on the indigenous population. (C) Terrorism can be viewed as a problem to be resolved by military means (war on terrorism), by normal police techniques (terrorism as crime), or as a medical problem with underlying causes and symptoms (terrorism as disease). (D) Terrorism is viewed as a criminal problem. The criminalization of terrorism has two important implications. Firstly, it suggests that terrorism can be eradicated - terrorists can be caught and brought to trial by normal judicial proceedings thereby removing the threat from society - and secondly, it suggests that preventative crime techniques are applicable to prevent its development.\nA: Let's think step by step. 
We refer to Wikipedia articles on security studies for help. (A) is wrong because it is not competition between larger nations that causes terrorism. \n(B) is wrong because globalization is not the cause of terrorism.\n(C) is correct because the US undertook the war on terrorism. \n(D) is wrong because preventative crime techniques will likely not end terrorism. The answer is (C).\n\nQ: Which of the following is the best lens through which to investigate the role of child soldiers?\n(A) Child soldiers are victims of combat that need re-education and rehabilitation. (B) Children and their mothers are not active subjects in warfare and are best considered as subjects in the private sphere. (C) Children are most often innocent bystanders in war and are best used as signifiers of peace. (D) Children have political subjecthood that is missed when they are considered as passive victims of warfare.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Child soliders as a political topic can be missed when they are considered passive victims of warfare. The answer is (D).\n\nQ: How can we best describe the relationship between the state-centric approach and the concept of human security?\n(A) There are such wide divisions within the human security framework regarding the nature of threats and referent objects that no widely applicable comparisons between state-centric approaches and human security can be drawn. (B) By adopting the framework of human security, the limitations of the realist state-centric approach become evident. Whilst human security defines the referent object as the person or population, state-centric approaches prioritise the security of the state, de-prioritizing the pursuit of human security. (C) The state-centric approach to security is a faction of human security, usually defined within the broad school of human security. By being state-centric this approach prioritises the individual as the referent object in security studies. (D) Both the state-centric and human-centric approaches to security are mutually exclusive and offer a sufficient analytic framework with which to understand the international security system. It is therefore the role of security analysts to determine which of these substantial concepts is correct, and which should be discarded.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Human security focuses on a person or population whereas state-centric approaches focus on the state while deprioritizing human security. The answer is (B).\n\nQ: In order to become securitized, a threat must be presented in which of these ways?\n(A) As an existential threat that requires immediate and extraordinary action, posing a threat to the survival of the state or to societal security. (B) As requiring immediate and extraordinary action by the state, threatening the survival of a referent object and therefore warranting the use of measures not normally employed in the political realm. (C) As an urgent threat to the survival of the referent object, so serious that it legitimises the employment of extraordinary action in response. (D) As an urgent threat to the survival of the audience that requires extraordinary or emergency measures.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. To be securitized, a threat must be an urgent threat to the survival of the referent object. 
The answer is (C).\n\nQ: What distinguishes coercive diplomacy from military force?\n(A) Compellence is another term for coercive diplomacy, but covering a narrower set of criteria; compellence covers those threats aimed at initiating adversary action. A threat to coerce a state to give up part of its territory would count as coercive diplomacy, as long as that threat proactively initiates action before reactive diplomacy is taken. (B) Coercive diplomacy constitutes the threats of limited force to induce adversary's incentive to comply with the coercer's demands. It is an influence strategy that is intended to obtain compliance: the use of force to defeat an opponent first does not count. It leaves an element of choice with the target to comply, or to continue. (C) Military force, or the threat of military force, utilises fear to achieve strategic objectives. Coercive diplomacy is differentiated from this approach, because it does not use fear as a tool for coercing an adversary. (D) Coercive diplomacy is employed to use force but to limit its effects on the international community. Coercive diplomacy is an aggressive strategy that is intended to obtain compliance through defeat. It does not leave an element of choice with the target, the target either being forced to comply or engage in conflict. It seeks to control by imposing compliance by removing any opportunity for negotiation or concession.\nA: Let's think step by step. We refer to Wikipedia articles on security studies for help. Coercive diplomacy uses the threat of force to induce the opponent to comply with demands. The answer is (B).", "sociology": "The following are multiple choice questions (with answers) about sociology.\n\nQ: Which of the following is not a problem associated with official statistics on strike action?\n(A) most strikes go unnoticed by employers and the mass media (B) not all industrial disputes will be reported by the employer (C) the definition of strikes excludes those that involve fewer than ten workers or last less than one day (D) it is hard to compare strikes that were measured in different ways\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Official statistics on strike action can be problematic because not all industrial disputes will be reported by employers, the definition of strikes excludes those that involves fewer than ten workers or last less than one day, and it is hard to compare strikes that were measured in different ways. Thus, (A) is not a problem associated with official statistics on strike action. The answer is (A).\n\nQ: What does Berger (1963) describe as a metaphor for social reality?\n(A) a fairground ride (B) a circus (C) a puppet theatre (D) a ballet\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Berger describes social reality using the metaphor of a puppet theatre. The answer is (C).\n\nQ: The term 'hegemony' refers to:\n(A) the tendency for the working class not to realize their own interests (B) a dominant ideology that legitimates economic, political and cultural power (C) a form of dual consciousness based on ideology and everyday experiences (D) a mode of payment given for outstanding topiary\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. Hegemony refers to a dominant ideology that legitimates economic, policital, and cultural power. 
The answer is (B).\n\nQ: The shift from 'civil religion' to 'common religion' means that:\n(A) the increasing bureaucracy of the state has made religion only a marginal part of our lives (B) despite the weakening of traditional authority, our everyday lives and 'common sense' remain shaped by religious beliefs and values (C) religious participation in collective worship may have declined, but people still practise their faiths in private (D) people are much more likely to discuss their religious beliefs in public, informal settings\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. The shift from civil religion to common religion means that despite the weakening of traditional authority, our everyday lives and common sense remain shaped by religious beliefs and values. The answer is (B).\n\nQ: Which of the following did the post-war welfare state of 1948 not aim to provide:\n(A) free health care and education for all (B) a minimum wage (C) full employment (D) universal welfare\nA: Let's think step by step. We refer to Wikipedia articles on sociology for help. The post-war welfare state of 1948 aimed to provide free healthcare and education, full employment, and universal welfare. But it did not aim to provide a minimum wage. The answer is (B).", "us_foreign_policy": "The following are multiple choice questions (with answers) about us foreign policy.\n\nQ: How did Donald Trump attack globalization in the 2016 campaign?\n(A) Globalization had made men like him too rich (B) Globalization only benefited certain American states, such as New York (C) Liberal elites had encouraged globalization, while 'ordinary Americans' lost jobs because of it (D) Globalization encouraged damaging trade wars\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. Trump attacked globalization because he believed ordinary Americans lost jobs due to it, and so he wanted to blame liberals who had encouraged it. The answer is (C).\n\nQ: How did NSC-68 change U.S. strategy?\n(A) It globalized containment. (B) It militarized containment. (C) It called for the development of the hydrogen bomb. (D) All of the above\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. NSC-68 outlined a variety of courses of action, including globalization of containment, militarization of contaiment, and the development of the hydrogen bomb. The answer is (D).\n\nQ: How do Defensive Realism and Offensive Realism differ in their explanation of state behaviour?\n(A) Defensive realists place greater emphasis on the role of international institutions (B) Defensive realists place less emphasis on geographical factors (C) Offensive realists give more priority to the national interest than Defensive realists. (D) Defensive realists believe states are security maximizers, while Offensive realists believe states to be power maximizers\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. While defensive realism advocates that states are security maximizers, offensive realists think of states as power maximizers. The answer is (D).\n\nQ: The realm of policy decisions concerned primarily with relations between the United States and the rest of the world is known as\n(A) terrorism policy. (B) economic policy. (C) foreign policy. (D) international policy.\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. 
The topic of policy decisions concerns with relations between the US and the rest of the world is known as foreign policy. The answer is (C).\n\nQ: How did the 2008 financial crisis affect America's international reputation?\n(A) It damaged support for the US model of political economy and capitalism (B) It created anger at the United States for exaggerating the crisis (C) It increased support for American global leadership under President Obama (D) It reduced global use of the US dollar\nA: Let's think step by step. We refer to Wikipedia articles on us foreign policy for help. The 2008 financial crisis damanged the international reputation of the American model of political economy and capitalism. The answer is (A).", "virology": "The following are multiple choice questions (with answers) about virology.\n\nQ: The median survival time to AIDS and death was established by following:\n(A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters (D) High-risk seronegatives\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The median survival time to AIDS and death was established as a result of the development of seroconverters. The answer is (C).\n\nQ: Which of the following is a morphological characteristic of the paramyxoviruses.\n(A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate viruses (C) Icosahedral viruses with envelope (D) Very large viruses\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. Paramyxoviruses are fragile viruses often visualised with RNA spewing from the inside. The answer is (A).\n\nQ: The most important goal of a behavioral intervention is:\n(A) Change in behavior (B) Comprehensive coverage (C) Effective use of behavioral theory (D) Sustained behavior change\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The prim goal of a behavioral intervention is to cause sustained behavior change. The answer is (D).\n\nQ: A key factor facilitating the application of nested case-control studies from the MACS was:\n(A) Data collection (B) Establishment of a repository of biologic specimens (C) Participant interest (D) Administration of the questionnaire by staff\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. The Multicenter AIDS Cohort Study's use of nested case-control studies was facilitated by the establishment of a repository of biologic specimens. The answer is (B).\n\nQ: Why are parvoviruses a highly impactful parasite?\n(A) Because they have no nucleic acid (B) They require a helper virus (C) Only replicate in dividing cells (D) Can integrate into host chromosomes\nA: Let's think step by step. We refer to Wikipedia articles on virology for help. Paroviruses are highly impactful because they do not have nucleic acid. The answer is (A).", "world_religions": "The following are multiple choice questions (with answers) about world religions.\n\nQ: How can the Upanishads be characterized?\n(A) Ritual texts (B) Philosophical texts (C) Hymns (D) Origin stories\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The Upanishads are the most recent part of Vedas (the oldest scriptures in Hinduism) and supplied the basis of later Hindu philosophy. So they are philosophical texts. The answer is (B).\n\nQ: What is the Second Gem in Buddhism?\n(A) The Dharma (B) The Sangha (C) The Buddha (D) The Bodhisattva\nA: Let's think step by step. 
We refer to Wikipedia articles on world religions for help. The Second Gem in Buddhism is The Dharma. The answer is (A).\n\nQ: Which Japanese government promoted a kind of national cult based on the emperor and his associations with kami?\n(A) Honen (B) Tanaka (C) Tokugawa (D) Meiji\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The promotion of a national cult based on the emperor and his associations with Kami happened during the reign of Emperor Meiji (1852-1912). The answer is (D).\n\nQ: In which dynasty was the \"Mandate of Heaven\" developed to legitimatize the new rulers?\n(A) Shang (B) Zhou (C) Han (D) Xia\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. The \"Mandate of Heaven\" was developed as an ancient Chinese philosophical concept during the Zhou Dynasty (1046-256 BCE). The answer is (B).\n\nQ: What is the sign of the covenant for Jewish males?\n(A) The rainbow (B) Circumcision (C) A son (D) Bar mitzvah\nA: Let's think step by step. We refer to Wikipedia articles on world religions for help. In Judaism, the most distinctive sign of the covenant is circumcision (brit milah). The answer is (B)."} diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index f0dbd6bd..542e11b2 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -74,12 +74,8 @@ SUBJECTS = [ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--base_yaml_path", required=True) - parser.add_argument( - "--save_prefix_path", default="flan" - ) - parser.add_argument( - "--cot_prompt_path", default=None - ) + parser.add_argument("--save_prefix_path", default="flan") + parser.add_argument("--cot_prompt_path", default=None) parser.add_argument("--task_prefix", default="") return parser.parse_args() @@ -95,6 +91,7 @@ if __name__ == "__main__": if args.cot_prompt_path is not None: import json + with open(args.cot_prompt_path) as f: cot_file = json.load(f) @@ -106,7 +103,9 @@ if __name__ == "__main__": yaml_dict = { "include": base_yaml_name, - "task": f"mmlu_{args.task_prefix}_{subject}" if args.task_prefix != "" else f"mmlu_{subject}", + "task": f"mmlu_{args.task_prefix}_{subject}" + if args.task_prefix != "" + else f"mmlu_{subject}", "dataset_name": subject, "description": description, } @@ -114,4 +113,10 @@ if __name__ == "__main__": file_save_path = args.save_prefix_path + f"_{subject}.yaml" eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") with open(file_save_path, "w") as yaml_file: - yaml.dump(yaml_dict, yaml_file, width=float("inf"), allow_unicode=True, default_style='"') + yaml.dump( + yaml_dict, + yaml_file, + width=float("inf"), + allow_unicode=True, + default_style='"', + ) diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index 757f7b5e..1064a2d1 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -12,4 +12,4 @@ metric_list: higher_is_better: true - metric: acc_norm aggregation: mean - higher_is_better: true \ No newline at end of file + higher_is_better: true diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index ffa9ee87..8461b93a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ 
b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -21,4 +21,4 @@ metric_list: aggregation: mean higher_is_better: true ignore_case: true - ignore_punctuation: true \ No newline at end of file + ignore_punctuation: true diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml index 2d5d92ef..5db2981a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml @@ -12,4 +12,4 @@ metric_list: higher_is_better: true - metric: acc_norm aggregation: mean - higher_is_better: true \ No newline at end of file + higher_is_better: true -- GitLab From c640376551f198e598d930333517010322b87527 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 21 Sep 2023 14:12:40 +0000 Subject: [PATCH 043/212] update --- lm_eval/tasks/benchmarks/flan_held_out.yaml | 8 +- lm_eval/tasks/benchmarks/t0_eval.yaml | 148 ++++++++++---------- 2 files changed, 78 insertions(+), 78 deletions(-) diff --git a/lm_eval/tasks/benchmarks/flan_held_out.yaml b/lm_eval/tasks/benchmarks/flan_held_out.yaml index f61affea..150e9477 100644 --- a/lm_eval/tasks/benchmarks/flan_held_out.yaml +++ b/lm_eval/tasks/benchmarks/flan_held_out.yaml @@ -3,10 +3,10 @@ task: # BBH - bbh_flan_zeroshot - bbh_flan_fewshot - - bbh_flan_cot_fewshot - - bbh_flan_cot_zeroshot + # - bbh_flan_cot_fewshot + # - bbh_flan_cot_zeroshot # MMLU - mmlu_flan_n_shot_generative - mmlu_flan_n_shot_loglikelihood - - mmlu_flan_cot_zeroshot - - mmlu_flan_cot_fewshot + # - mmlu_flan_cot_zeroshot + # - mmlu_flan_cot_fewshot diff --git a/lm_eval/tasks/benchmarks/t0_eval.yaml b/lm_eval/tasks/benchmarks/t0_eval.yaml index 9cd25b51..788122e9 100644 --- a/lm_eval/tasks/benchmarks/t0_eval.yaml +++ b/lm_eval/tasks/benchmarks/t0_eval.yaml @@ -26,80 +26,80 @@ task: higher_is_better: true ignore_case: true ignore_punctuation: true - # # Natural Language Inference - # - dataset_path: super_glue - # dataset_name: cb - # use_prompt: promptsource:* - # training_split: train - # validation_split: validation - # output_type: greedy_until - # metric_list: - # - metric: exact_match - # aggregation: mean - # higher_is_better: true - # ignore_case: true - # ignore_punctuation: true - # - dataset_path: super_glue - # dataset_name: rte - # use_prompt: promptsource:* - # training_split: train - # validation_split: validation - # output_type: greedy_until - # metric_list: - # - metric: exact_match - # aggregation: mean - # higher_is_better: true - # ignore_case: true - # ignore_punctuation: true - # - task: anli_r1 - # dataset_path: anli - # use_prompt: promptsource:* - # training_split: train_r1 - # validation_split: dev_r1 - # output_type: greedy_until - # metric_list: - # - metric: exact_match - # aggregation: mean - # higher_is_better: true - # ignore_case: true - # ignore_punctuation: true - # - task: anli_r2 - # dataset_path: anli - # use_prompt: promptsource:* - # training_split: train_r2 - # validation_split: dev_r2 - # output_type: greedy_until - # metric_list: - # - metric: exact_match - # aggregation: mean - # higher_is_better: true - # ignore_case: true - # ignore_punctuation: true - # - task: anli_r3 - # dataset_path: anli - # use_prompt: promptsource:* - # training_split: train_r3 - # validation_split: dev_r3 - # output_type: greedy_until - # metric_list: - # - metric: exact_match - # aggregation: mean - # higher_is_better: true - # ignore_case: true - # 
ignore_punctuation: true - # # Sentence Completion - # - dataset_path: super_glue - # dataset_name: copa - # use_prompt: promptsource:* - # training_split: train - # validation_split: validation - # output_type: greedy_until - # metric_list: - # - metric: exact_match - # aggregation: mean - # higher_is_better: true - # ignore_case: true - # ignore_punctuation: true + # Natural Language Inference + - dataset_path: super_glue + dataset_name: cb + use_prompt: promptsource:* + training_split: train + validation_split: validation + output_type: greedy_until + metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - dataset_path: super_glue + dataset_name: rte + use_prompt: promptsource:* + training_split: train + validation_split: validation + output_type: greedy_until + metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - task: anli_r1 + dataset_path: anli + use_prompt: promptsource:* + training_split: train_r1 + validation_split: dev_r1 + output_type: greedy_until + metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - task: anli_r2 + dataset_path: anli + use_prompt: promptsource:* + training_split: train_r2 + validation_split: dev_r2 + output_type: greedy_until + metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - task: anli_r3 + dataset_path: anli + use_prompt: promptsource:* + training_split: train_r3 + validation_split: dev_r3 + output_type: greedy_until + metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + # Sentence Completion + - dataset_path: super_glue + dataset_name: copa + use_prompt: promptsource:* + training_split: train + validation_split: validation + output_type: greedy_until + metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true # Natural Language Inference - dataset_path: hellaswag use_prompt: promptsource:* -- GitLab From 94346b7ed8a699444d5f7b9defbb424134a45d7a Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 22 Sep 2023 17:52:33 +0200 Subject: [PATCH 044/212] Allow forced import of metrics from the HF Evaluate library --- lm_eval/api/registry.py | 29 ++++++++++++++--------------- lm_eval/api/task.py | 5 +++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 85d30f0f..e5da4b44 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -117,24 +117,23 @@ def register_metric(**args): return decorate -def get_metric(name): +def get_metric(name, hf_evaluate_metric=False): + if not hf_evaluate_metric: + if name in METRIC_REGISTRY: + return METRIC_REGISTRY[name] + else: + eval_logger.warning( + f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..." + ) + try: - return METRIC_REGISTRY[name] - except KeyError: - # TODO: change this print to logging? - print( - f"Could not find registered metric '{name}' in lm-eval, \ -searching in HF Evaluate library..." + metric_object = evaluate.load(name) + return metric_object.compute + except Exception: + eval_logger.error( + f"{name} not found in the evaluate library! 
Please check https://huggingface.co/evaluate-metric", ) - try: - metric_object = evaluate.load(name) - return metric_object.compute - except Exception: - eval_logger.error( - "{} not found in the evaluate library!".format(name), - "Please check https://huggingface.co/evaluate-metric", - ) def register_aggregation(name): diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 3f5d18dc..ba4ac150 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -555,8 +555,9 @@ class ConfigurableTask(Task): kwargs = { key: metric_config[key] for key in metric_config - if key not in ["metric", "aggregation", "higher_is_better"] + if key not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"] } + hf_evaluate_metric = "hf_evaluate" in metric_config and metric_config["hf_evaluate"] == True if self.config.process_results is not None: self._metric_fn_list[metric_name] = None @@ -567,7 +568,7 @@ class ConfigurableTask(Task): self._metric_fn_list[metric_name] = metric_fn self._metric_fn_kwargs[metric_name] = kwargs else: - self._metric_fn_list[metric_name] = get_metric(metric_name) + self._metric_fn_list[metric_name] = get_metric(metric_name, hf_evaluate_metric) self._metric_fn_kwargs[metric_name] = kwargs if "aggregation" in metric_config: -- GitLab From 5b26b3b0e01f5d2bf4969ce32dd0cb0b1d7e4549 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 22 Sep 2023 17:57:49 +0200 Subject: [PATCH 045/212] Cast gold to the same type as result --- lm_eval/api/task.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index ba4ac150..d1aa8c58 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1069,6 +1069,7 @@ class ConfigurableTask(Task): elif self.OUTPUT_TYPE == "greedy_until": gold = self.doc_to_target(doc) + result = results[0] if self.config.doc_to_choice is not None: # If you set doc_to_choice, # it assumes that doc_to_target returns a number. @@ -1077,10 +1078,10 @@ class ConfigurableTask(Task): # we expect multiple_targets to be a list. elif self.multiple_target: gold = list(gold) - else: - gold = str(gold) + elif type(gold) != type(result): + # cast gold to the same type as result + gold = type(result)(gold) - result = results[0] for metric in self._metric_fn_list.keys(): if self.multiple_target: # in the case where we have multiple targets, -- GitLab From 90b261e5786ec7d1dcc0f3ef26431d5142dfa7bd Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 22 Sep 2023 18:01:25 +0200 Subject: [PATCH 046/212] Add transformation filters: lowercase, uppercase, map --- lm_eval/filters/__init__.py | 4 +++ lm_eval/filters/transformation.py | 47 +++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 lm_eval/filters/transformation.py diff --git a/lm_eval/filters/__init__.py b/lm_eval/filters/__init__.py index 54578cb4..c74ac015 100644 --- a/lm_eval/filters/__init__.py +++ b/lm_eval/filters/__init__.py @@ -1,6 +1,7 @@ from lm_eval.api.filter import FilterEnsemble from . import selection from . import extraction +from . import transformation FILTER_REGISTRY = { @@ -9,6 +10,9 @@ FILTER_REGISTRY = { "majority_vote": selection.MajorityVoteFilter, "take_first_k": selection.TakeKFilter, "remove_whitespace": extraction.WhitespaceFilter, + "lowercase": transformation.LowercaseFilter, + "uppercase": transformation.UppercaseFilter, + "map": transformation.MapFilter, # TODO: implement this filter. 
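For illustration, a minimal usage sketch of the new transformation filters; the yes/no label mapping below is an assumption made for the example, not something the harness defines:

from lm_eval.filters.transformation import LowercaseFilter, MapFilter

resps = [["Yes", "NO"], ["yes"]]  # one list of candidate responses per document
docs = [{}, {}]  # these filters do not inspect the docs argument

lowered = LowercaseFilter().apply(resps, docs)  # [["yes", "no"], ["yes"]]
mapped = MapFilter({"yes": 1, "no": 0}, default_value=-1).apply(lowered, docs)
print(mapped)  # [[1, 0], [1]]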
either it should take in an arbitrary "scoring"/reward function # that takes an input and returns a scalar and then should select the max reward, # or should implement different filters for different ways of handling a reward model's inference. diff --git a/lm_eval/filters/transformation.py b/lm_eval/filters/transformation.py new file mode 100644 index 00000000..16e31a7c --- /dev/null +++ b/lm_eval/filters/transformation.py @@ -0,0 +1,47 @@ +from lm_eval.api.filter import Filter + + +class LowercaseFilter(Filter): + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + return [resp.lower() for resp in inst] + + return [filter_set(resp) for resp in resps] + + +class UppercaseFilter(Filter): + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + return [resp.upper() for resp in inst] + + return [filter_set(resp) for resp in resps] + +class MapFilter(Filter): + def __init__(self, mapping_dict: dict = {}, default_value = None) -> None: + """ + Initializes the MapFilter with a given mapping dictionary and default value. + + Args: + - mapping_dict (dict): A dictionary containing the key-value mappings. + Default is an empty dictionary. + - default_value (Any): The value to be returned when a key is not found in the mapping_dict. + Default is None. + + Example: + mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) + """ + assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary" + self.mapping_dict = mapping_dict + self.default_value = default_value + + def apply(self, resps, docs): + def filter_set(inst): + return [self.mapping_dict.get(resp, self.default_value) for resp in inst] + + return [filter_set(resp) for resp in resps] -- GitLab From b4c72cb9808bc23bb65b3eb56764361f6075da9f Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 22 Sep 2023 18:09:30 +0200 Subject: [PATCH 047/212] Update new_task_guide.md --- docs/new_task_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 54745f47..0641baae 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -214,7 +214,7 @@ metric_list: ``` `aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric, otherwise it must be defined explicitly (for example, when using a custom metric implemented as a function). -For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval`. +For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval` or `hf_evaluate` is set to `true`. 
### Optional, More Advanced Setup -- GitLab From 34d1ac6bc35ca94808bac938d33e96b8577ac2dc Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 22 Sep 2023 18:37:38 +0200 Subject: [PATCH 048/212] Use 'is' for boolean comparison --- lm_eval/api/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index d1aa8c58..95beb803 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -557,7 +557,7 @@ class ConfigurableTask(Task): for key in metric_config if key not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"] } - hf_evaluate_metric = "hf_evaluate" in metric_config and metric_config["hf_evaluate"] == True + hf_evaluate_metric = "hf_evaluate" in metric_config and metric_config["hf_evaluate"] is True if self.config.process_results is not None: self._metric_fn_list[metric_name] = None -- GitLab From ad4ab52aa4f0952fb4958518d3942b32eb914d28 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 22 Sep 2023 18:43:37 +0200 Subject: [PATCH 049/212] Fix formatting --- lm_eval/filters/transformation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lm_eval/filters/transformation.py b/lm_eval/filters/transformation.py index 16e31a7c..f1239638 100644 --- a/lm_eval/filters/transformation.py +++ b/lm_eval/filters/transformation.py @@ -11,7 +11,7 @@ class LowercaseFilter(Filter): return [filter_set(resp) for resp in resps] - + class UppercaseFilter(Filter): def __init__(self) -> None: pass @@ -21,9 +21,10 @@ class UppercaseFilter(Filter): return [resp.upper() for resp in inst] return [filter_set(resp) for resp in resps] - + + class MapFilter(Filter): - def __init__(self, mapping_dict: dict = {}, default_value = None) -> None: + def __init__(self, mapping_dict: dict = {}, default_value=None) -> None: """ Initializes the MapFilter with a given mapping dictionary and default value. -- GitLab From 8310e67e7de114ef2c41e249cdebda9f2779c1cb Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 22 Sep 2023 18:59:44 +0200 Subject: [PATCH 050/212] Fix formatting --- lm_eval/api/registry.py | 2 +- lm_eval/api/task.py | 12 +++++++++--- lm_eval/filters/transformation.py | 4 +++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index e5da4b44..53e5771a 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -126,7 +126,7 @@ def get_metric(name, hf_evaluate_metric=False): eval_logger.warning( f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..." 
) - + try: metric_object = evaluate.load(name) return metric_object.compute diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 95beb803..9c46214b 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -555,9 +555,13 @@ class ConfigurableTask(Task): kwargs = { key: metric_config[key] for key in metric_config - if key not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"] + if key + not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"] } - hf_evaluate_metric = "hf_evaluate" in metric_config and metric_config["hf_evaluate"] is True + hf_evaluate_metric = ( + "hf_evaluate" in metric_config + and metric_config["hf_evaluate"] is True + ) if self.config.process_results is not None: self._metric_fn_list[metric_name] = None @@ -568,7 +572,9 @@ class ConfigurableTask(Task): self._metric_fn_list[metric_name] = metric_fn self._metric_fn_kwargs[metric_name] = kwargs else: - self._metric_fn_list[metric_name] = get_metric(metric_name, hf_evaluate_metric) + self._metric_fn_list[metric_name] = get_metric( + metric_name, hf_evaluate_metric + ) self._metric_fn_kwargs[metric_name] = kwargs if "aggregation" in metric_config: diff --git a/lm_eval/filters/transformation.py b/lm_eval/filters/transformation.py index f1239638..f254b0db 100644 --- a/lm_eval/filters/transformation.py +++ b/lm_eval/filters/transformation.py @@ -37,7 +37,9 @@ class MapFilter(Filter): Example: mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) """ - assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary" + assert isinstance( + mapping_dict, dict + ), "Provided mapping_dict is not a dictionary" self.mapping_dict = mapping_dict self.default_value = default_value -- GitLab From 0dc556dcf6285fbb3f49fb726bf3209431d93d05 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 25 Sep 2023 07:02:36 +0000 Subject: [PATCH 051/212] add function to add both task and benchmark --- lm_eval/tasks/__init__.py | 72 +++++++++++-------- .../flan/prompt_templates/flan_anli.yaml | 29 -------- .../flan/prompt_templates/flan_arc.yaml | 23 ------ .../flan/prompt_templates/flan_boolq.yaml | 33 --------- .../flan/prompt_templates/flan_rte.yaml | 29 -------- main.py | 4 +- 6 files changed, 46 insertions(+), 144 deletions(-) delete mode 100644 lm_eval/tasks/benchmarks/flan/prompt_templates/flan_anli.yaml delete mode 100644 lm_eval/tasks/benchmarks/flan/prompt_templates/flan_arc.yaml delete mode 100644 lm_eval/tasks/benchmarks/flan/prompt_templates/flan_boolq.yaml delete mode 100644 lm_eval/tasks/benchmarks/flan/prompt_templates/flan_rte.yaml diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index c6a78e9c..b935d106 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -45,6 +45,18 @@ def register_configurable_group(config: Dict[str, str]) -> int: task_list = [task for task in all_task_list if type(task) == str] for task_config in config_list: + # if "task" in task_config: + # task = task_config["task"] + # if task in GROUP_REGISTRY: + # task_list = GROUP_REGISTRY[task] + # elif task in TASK_REGISTRY: + # task_list = [TASK_REGISTRY[task]] + + # for _task in task_list: + # task_config = { + # **_task["CONFIG"], + # **task_config + # } var_configs = check_prompt_config( { **task_config, @@ -109,36 +121,40 @@ def include_task_folder(task_dir: str, register_task=True) -> None: Calling this function """ for root, subdirs, file_list in os.walk(task_dir): - if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0): - for f in file_list: - if 
f.endswith(".yaml"): - yaml_path = os.path.join(root, f) - try: - config = utils.load_yaml_config(yaml_path) - - if register_task: - all_configs = check_prompt_config(config) - for config in all_configs: - register_configurable_task(config) - else: - # If a `task` in config is a list, - # that means it's a benchmark - if type(config["task"]) == list: - register_configurable_group(config) - - except Exception as error: - eval_logger.warning( - "Failed to load config in\n" - f" {yaml_path}\n" - " Config will not be added to registry\n" - f" Error: {error}" - ) - + # if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0): + for f in file_list: + if f.endswith(".yaml"): + yaml_path = os.path.join(root, f) + try: + config = utils.load_yaml_config(yaml_path) + + if register_task: + all_configs = check_prompt_config(config) + for config in all_configs: + register_configurable_task(config) + else: + # If a `task` in config is a list, + # that means it's a benchmark + if type(config["task"]) == list: + register_configurable_group(config) + + except Exception as error: + eval_logger.warning( + "Failed to load config in\n" + f" {yaml_path}\n" + " Config will not be added to registry\n" + f" Error: {error}" + ) + + +def include_path(task_dir): + include_task_folder(task_dir) + # Register Benchmarks after all tasks have been added + include_task_folder(task_dir, register_task=False) + return 0 task_dir = os.path.dirname(os.path.abspath(__file__)) + "/" -include_task_folder(task_dir) -# Register Benchmarks after all tasks have been added -include_task_folder(task_dir, register_task=False) +include_path(task_dir) def get_task(task_name, config): diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_anli.yaml deleted file mode 100644 index 6ff78840..00000000 --- a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_anli.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Flan Prompt Templates -prompts: - "template-0": - doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-1": - doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-2": - doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-3": - doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-4": - doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-5": - doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-6": - 
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-7": - doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" - "template-8": - doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No" - doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}" diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_arc.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_arc.yaml deleted file mode 100644 index 4ee34e65..00000000 --- a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_arc.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Flan Prompt Templates -prompts: - "template-0": - doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" - "template-1": - doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:" - doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" - "template-2": - doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" - "template-3": - doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:" - doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" - "template-4": - doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" - "template-5": - doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" - "template-6": - doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}" - doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}" diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_boolq.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_boolq.yaml deleted file mode 100644 index f8c8ebfc..00000000 --- a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_boolq.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# Flan Prompt Templates -prompts: - "template-0": - doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-1": - doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-2": - doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-3": - doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-4": - doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 
'yes'][label]}}" - "template-5": - doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-6": - doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-7": - doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-8": - # doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" - doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" - "template-9": - doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes" - doc_to_target: "{{['no', 'yes'][label]}}" diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_rte.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_rte.yaml deleted file mode 100644 index 7893eae4..00000000 --- a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_rte.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Flan Prompt Templates -prompts: - "template-0": - doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-1": - doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-2": - doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-3": - doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-4": - doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-5": - doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-6": - doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-7": - doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:" - doc_to_target: "{{['yes', 'no'][label]}}" - "template-8": - doc_to_text: "Determine if the sentence is true based on the text below. 
Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no" - doc_to_target: "{{['yes', 'no'][label]}}" diff --git a/main.py b/main.py index 96207884..1e6302b7 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ from pathlib import Path from lm_eval import evaluator, utils from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger, SPACING -from lm_eval.tasks import include_task_folder +from lm_eval.tasks import include_path os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -111,7 +111,7 @@ def main() -> None: if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") - include_task_folder(args.include_path) + include_path(args.include_path) if args.tasks is None: task_names = ALL_TASKS -- GitLab From 694af7d656989d6a22623e5245db655569b40390 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 25 Sep 2023 14:00:14 +0000 Subject: [PATCH 052/212] remove temperature as do_sample is False --- lm_eval/api/task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 95c08af4..64c0d7e8 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -117,7 +117,6 @@ class TaskConfig(dict): if self.fewshot_delimiter is None else [self.fewshot_delimiter], "do_sample": False, - "temperature": 0.0, } # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor? -- GitLab From c5ebdd0fac3ae074e7379e3d99d9744d8db53bcc Mon Sep 17 00:00:00 2001 From: ManuelFay Date: Mon, 25 Sep 2023 16:42:59 +0200 Subject: [PATCH 053/212] add belebele --- lm_eval/tasks/README.md | 1 + lm_eval/tasks/belebele/README.md | 47 +++++++++++++ lm_eval/tasks/belebele/_default_template_yaml | 19 ++++++ lm_eval/tasks/belebele/_generate_configs.py | 67 +++++++++++++++++++ lm_eval/tasks/belebele/belebele_acm_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_afr_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_als_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_amh_Ethi.yaml | 4 ++ lm_eval/tasks/belebele/belebele_apc_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_arb_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_arb_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ars_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ary_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_arz_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_asm_Beng.yaml | 4 ++ lm_eval/tasks/belebele/belebele_azj_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_bam_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ben_Beng.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ben_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_bod_Tibt.yaml | 4 ++ lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_cat_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ceb_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ces_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ckb_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_dan_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_deu_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ell_Grek.yaml | 4 ++ lm_eval/tasks/belebele/belebele_eng_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_est_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_eus_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_fin_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_fra_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_fuv_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_gaz_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_grn_Latn.yaml | 4 ++
lm_eval/tasks/belebele/belebele_guj_Gujr.yaml | 4 ++ lm_eval/tasks/belebele/belebele_hat_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_hau_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_heb_Hebr.yaml | 4 ++ lm_eval/tasks/belebele/belebele_hin_Deva.yaml | 4 ++ lm_eval/tasks/belebele/belebele_hin_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_hrv_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_hun_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_hye_Armn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ibo_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ilo_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ind_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_isl_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ita_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_jav_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kac_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kan_Knda.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kat_Geor.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kea_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_khm_Khmr.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kin_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_kor_Hang.yaml | 4 ++ lm_eval/tasks/belebele/belebele_lao_Laoo.yaml | 4 ++ lm_eval/tasks/belebele/belebele_lin_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_lit_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_lug_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_luo_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_lvs_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_mal_Mlym.yaml | 4 ++ lm_eval/tasks/belebele/belebele_mar_Deva.yaml | 4 ++ lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_mlt_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_mri_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_mya_Mymr.yaml | 4 ++ lm_eval/tasks/belebele/belebele_nld_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_nob_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_npi_Deva.yaml | 4 ++ lm_eval/tasks/belebele/belebele_npi_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_nso_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_nya_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ory_Orya.yaml | 4 ++ lm_eval/tasks/belebele/belebele_pan_Guru.yaml | 4 ++ lm_eval/tasks/belebele/belebele_pbt_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_pes_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_plt_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_pol_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_por_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ron_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_shn_Mymr.yaml | 4 ++ lm_eval/tasks/belebele/belebele_sin_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_sin_Sinh.yaml | 4 ++ lm_eval/tasks/belebele/belebele_slk_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_slv_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_sna_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_snd_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_som_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_sot_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_spa_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ssw_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_sun_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_swe_Latn.yaml | 4 ++ 
lm_eval/tasks/belebele/belebele_swh_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tam_Taml.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tel_Telu.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tgl_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tha_Thai.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tir_Ethi.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tsn_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tso_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_tur_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml | 4 ++ lm_eval/tasks/belebele/belebele_urd_Arab.yaml | 4 ++ lm_eval/tasks/belebele/belebele_urd_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_uzn_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_vie_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_war_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_wol_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_xho_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_yor_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_zho_Hans.yaml | 4 ++ lm_eval/tasks/belebele/belebele_zho_Hant.yaml | 4 ++ lm_eval/tasks/belebele/belebele_zsm_Latn.yaml | 4 ++ lm_eval/tasks/belebele/belebele_zul_Latn.yaml | 4 ++ 126 files changed, 622 insertions(+) create mode 100644 lm_eval/tasks/belebele/README.md create mode 100644 lm_eval/tasks/belebele/_default_template_yaml create mode 100644 lm_eval/tasks/belebele/_generate_configs.py create mode 100644 lm_eval/tasks/belebele/belebele_acm_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_afr_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_als_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_amh_Ethi.yaml create mode 100644 lm_eval/tasks/belebele/belebele_apc_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_arb_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_arb_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ars_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ary_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_arz_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_asm_Beng.yaml create mode 100644 lm_eval/tasks/belebele/belebele_azj_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_bam_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ben_Beng.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ben_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_bod_Tibt.yaml create mode 100644 lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_cat_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ceb_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ces_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ckb_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_dan_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_deu_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ell_Grek.yaml create mode 100644 lm_eval/tasks/belebele/belebele_eng_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_est_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_eus_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_fin_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_fra_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_fuv_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_gaz_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_grn_Latn.yaml create mode 100644 
lm_eval/tasks/belebele/belebele_guj_Gujr.yaml create mode 100644 lm_eval/tasks/belebele/belebele_hat_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_hau_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_heb_Hebr.yaml create mode 100644 lm_eval/tasks/belebele/belebele_hin_Deva.yaml create mode 100644 lm_eval/tasks/belebele/belebele_hin_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_hrv_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_hun_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_hye_Armn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ibo_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ilo_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ind_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_isl_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ita_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_jav_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kac_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kan_Knda.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kat_Geor.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kea_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_khm_Khmr.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kin_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_kor_Hang.yaml create mode 100644 lm_eval/tasks/belebele/belebele_lao_Laoo.yaml create mode 100644 lm_eval/tasks/belebele/belebele_lin_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_lit_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_lug_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_luo_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_lvs_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_mal_Mlym.yaml create mode 100644 lm_eval/tasks/belebele/belebele_mar_Deva.yaml create mode 100644 lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_mlt_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_mri_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_mya_Mymr.yaml create mode 100644 lm_eval/tasks/belebele/belebele_nld_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_nob_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_npi_Deva.yaml create mode 100644 lm_eval/tasks/belebele/belebele_npi_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_nso_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_nya_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ory_Orya.yaml create mode 100644 lm_eval/tasks/belebele/belebele_pan_Guru.yaml create mode 100644 lm_eval/tasks/belebele/belebele_pbt_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_pes_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_plt_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_pol_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_por_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ron_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_shn_Mymr.yaml create mode 100644 
lm_eval/tasks/belebele/belebele_sin_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_sin_Sinh.yaml create mode 100644 lm_eval/tasks/belebele/belebele_slk_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_slv_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_sna_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_snd_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_som_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_sot_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_spa_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ssw_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_sun_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_swe_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_swh_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tam_Taml.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tel_Telu.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tgl_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tha_Thai.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tir_Ethi.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tsn_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tso_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_tur_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml create mode 100644 lm_eval/tasks/belebele/belebele_urd_Arab.yaml create mode 100644 lm_eval/tasks/belebele/belebele_urd_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_uzn_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_vie_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_war_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_wol_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_xho_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_yor_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_zho_Hans.yaml create mode 100644 lm_eval/tasks/belebele/belebele_zho_Hant.yaml create mode 100644 lm_eval/tasks/belebele/belebele_zsm_Latn.yaml create mode 100644 lm_eval/tasks/belebele/belebele_zul_Latn.yaml diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index b95012b5..17521221 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -59,6 +59,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for - [x] MGSM - [ ] SCROLLS - [x] Babi +- [x] Belebele # Novel Tasks Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*. diff --git a/lm_eval/tasks/belebele/README.md b/lm_eval/tasks/belebele/README.md new file mode 100644 index 00000000..8855c7c6 --- /dev/null +++ b/lm_eval/tasks/belebele/README.md @@ -0,0 +1,47 @@ +# Belebele + +### Paper + +The Belebele Benchmark for Massively Multilingual NLU Evaluation +https://arxiv.org/abs/2308.16884 + +Belebele is a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. This dataset enables the evaluation of mono- and multi-lingual models in high-, medium-, and low-resource languages. Each question has four multiple-choice answers and is linked to a short passage from the FLORES-200 dataset. 
The human annotation procedure was carefully curated to create questions that discriminate between different levels of generalizable language comprehension and is reinforced by extensive quality checks. While all questions directly relate to the passage, the English dataset on its own proves difficult enough to challenge state-of-the-art language models. Being fully parallel, this dataset enables direct comparison of model performance across all languages. Belebele opens up new avenues for evaluating and analyzing the multilingual abilities of language models and NLP systems. + +Homepage: https://github.com/facebookresearch/belebele + +### Citation + +```bibtex +@misc{bandarkar2023belebele, + title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants}, + author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa}, + year={2023}, + eprint={2308.16884}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +- `belebele`: All 122 languages of the Belebele dataset, evaluated following the methodology in MMLU's original implementation. + +#### Tasks + + +The following tasks evaluate languages in the Belebele dataset using loglikelihood-based multiple-choice scoring: +- `belebele_{language}` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? + * [ ] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/belebele/_default_template_yaml b/lm_eval/tasks/belebele/_default_template_yaml new file mode 100644 index 00000000..be3cf53b --- /dev/null +++ b/lm_eval/tasks/belebele/_default_template_yaml @@ -0,0 +1,19 @@ +group: belebele +dataset_path: facebook/belebele +test_split: test +fewshot_split: test +fewshot_config: + sampler: first_n +output_type: multiple_choice +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_text: "{{question.strip()}}\nA. {{mc_answer1}}\nB. {{mc_answer2}}\nC. {{mc_answer3}}\nD. 
{{mc_answer4}}\nAnswer:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/lm_eval/tasks/belebele/_generate_configs.py b/lm_eval/tasks/belebele/_generate_configs.py new file mode 100644 index 00000000..870d9773 --- /dev/null +++ b/lm_eval/tasks/belebele/_generate_configs.py @@ -0,0 +1,67 @@ +""" +Take in a YAML, and output all other splits with this YAML +""" +import os +import yaml +import argparse +import requests + +from tqdm import tqdm + +from lm_eval.logger import eval_logger + +API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele" + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--base_yaml_path", required=True) + parser.add_argument("--save_prefix_path", default="belebele") + parser.add_argument("--cot_prompt_path", default=None) + parser.add_argument("--task_prefix", default="") + return parser.parse_args() + + +if __name__ == "__main__": + + args = parse_args() + + # get filename of base_yaml so we can `"include": ` it in our other YAMLs. + base_yaml_name = os.path.split(args.base_yaml_path)[-1] + with open(args.base_yaml_path) as f: + base_yaml = yaml.full_load(f) + + if args.cot_prompt_path is not None: + import json + + with open(args.cot_prompt_path) as f: + cot_file = json.load(f) + + def query(): + response = requests.get(API_URL) + return response.json()["splits"] + + languages = [split["config"] for split in query()] + + for lang in tqdm(languages): + description = f"A split of Belebele for the {lang} language.\n\n" + + yaml_dict = { + "include": base_yaml_name, + "task": f"belebele_{args.task_prefix}_{lang}" + if args.task_prefix != "" + else f"belebele_{lang}", + "dataset_name": lang, + "description": description, + } + + file_save_path = args.save_prefix_path + f"_{lang}.yaml" + eval_logger.info(f"Saving yaml for subset {lang} to {file_save_path}") + with open(file_save_path, "w") as yaml_file: + yaml.dump( + yaml_dict, + yaml_file, + width=float("inf"), + allow_unicode=True, + default_style='"', + ) diff --git a/lm_eval/tasks/belebele/belebele_acm_Arab.yaml b/lm_eval/tasks/belebele/belebele_acm_Arab.yaml new file mode 100644 index 00000000..5059db8f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_acm_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "acm_Arab" +"description": "A split of Belebele for the acm_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_acm_Arab" diff --git a/lm_eval/tasks/belebele/belebele_afr_Latn.yaml b/lm_eval/tasks/belebele/belebele_afr_Latn.yaml new file mode 100644 index 00000000..b290183f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_afr_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "afr_Latn" +"description": "A split of Belebele for the afr_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_afr_Latn" diff --git a/lm_eval/tasks/belebele/belebele_als_Latn.yaml b/lm_eval/tasks/belebele/belebele_als_Latn.yaml new file mode 100644 index 00000000..1bda097d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_als_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "als_Latn" +"description": "A split of Belebele for the als_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_als_Latn" diff --git a/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml b/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml new file mode 
100644 index 00000000..615570e7 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml @@ -0,0 +1,4 @@ +"dataset_name": "amh_Ethi" +"description": "A split of Belebele for the amh_Ethi language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_amh_Ethi" diff --git a/lm_eval/tasks/belebele/belebele_apc_Arab.yaml b/lm_eval/tasks/belebele/belebele_apc_Arab.yaml new file mode 100644 index 00000000..64102c05 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_apc_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "apc_Arab" +"description": "A split of Belebele for the apc_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_apc_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arb_Arab.yaml b/lm_eval/tasks/belebele/belebele_arb_Arab.yaml new file mode 100644 index 00000000..f789df9c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_arb_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "arb_Arab" +"description": "A split of Belebele for the arb_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_arb_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arb_Latn.yaml b/lm_eval/tasks/belebele/belebele_arb_Latn.yaml new file mode 100644 index 00000000..f9a4180e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_arb_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "arb_Latn" +"description": "A split of Belebele for the arb_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_arb_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ars_Arab.yaml b/lm_eval/tasks/belebele/belebele_ars_Arab.yaml new file mode 100644 index 00000000..2c7d94c6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ars_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ars_Arab" +"description": "A split of Belebele for the ars_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ars_Arab" diff --git a/lm_eval/tasks/belebele/belebele_ary_Arab.yaml b/lm_eval/tasks/belebele/belebele_ary_Arab.yaml new file mode 100644 index 00000000..0a2d45b7 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ary_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ary_Arab" +"description": "A split of Belebele for the ary_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ary_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arz_Arab.yaml b/lm_eval/tasks/belebele/belebele_arz_Arab.yaml new file mode 100644 index 00000000..df78aeb1 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_arz_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "arz_Arab" +"description": "A split of Belebele for the arz_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_arz_Arab" diff --git a/lm_eval/tasks/belebele/belebele_asm_Beng.yaml b/lm_eval/tasks/belebele/belebele_asm_Beng.yaml new file mode 100644 index 00000000..1319b62c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_asm_Beng.yaml @@ -0,0 +1,4 @@ +"dataset_name": "asm_Beng" +"description": "A split of Belebele for the asm_Beng language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_asm_Beng" diff --git a/lm_eval/tasks/belebele/belebele_azj_Latn.yaml b/lm_eval/tasks/belebele/belebele_azj_Latn.yaml new file mode 100644 index 00000000..581b7625 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_azj_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "azj_Latn" +"description": "A split of Belebele for the azj_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_azj_Latn" diff --git a/lm_eval/tasks/belebele/belebele_bam_Latn.yaml 
b/lm_eval/tasks/belebele/belebele_bam_Latn.yaml new file mode 100644 index 00000000..82399037 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_bam_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "bam_Latn" +"description": "A split of Belebele for the bam_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_bam_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ben_Beng.yaml b/lm_eval/tasks/belebele/belebele_ben_Beng.yaml new file mode 100644 index 00000000..3fca76df --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ben_Beng.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ben_Beng" +"description": "A split of Belebele for the ben_Beng language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ben_Beng" diff --git a/lm_eval/tasks/belebele/belebele_ben_Latn.yaml b/lm_eval/tasks/belebele/belebele_ben_Latn.yaml new file mode 100644 index 00000000..a14f0e87 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ben_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ben_Latn" +"description": "A split of Belebele for the ben_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ben_Latn" diff --git a/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml b/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml new file mode 100644 index 00000000..da1b65b4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml @@ -0,0 +1,4 @@ +"dataset_name": "bod_Tibt" +"description": "A split of Belebele for the bod_Tibt language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_bod_Tibt" diff --git a/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml new file mode 100644 index 00000000..8d2cd4a0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "bul_Cyrl" +"description": "A split of Belebele for the bul_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_bul_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_cat_Latn.yaml b/lm_eval/tasks/belebele/belebele_cat_Latn.yaml new file mode 100644 index 00000000..b04cabba --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_cat_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "cat_Latn" +"description": "A split of Belebele for the cat_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_cat_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml b/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml new file mode 100644 index 00000000..8df66adf --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ceb_Latn" +"description": "A split of Belebele for the ceb_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ceb_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ces_Latn.yaml b/lm_eval/tasks/belebele/belebele_ces_Latn.yaml new file mode 100644 index 00000000..13b84c63 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ces_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ces_Latn" +"description": "A split of Belebele for the ces_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ces_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml b/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml new file mode 100644 index 00000000..bf7465c1 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ckb_Arab" +"description": "A split of Belebele for the ckb_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ckb_Arab" diff --git 
a/lm_eval/tasks/belebele/belebele_dan_Latn.yaml b/lm_eval/tasks/belebele/belebele_dan_Latn.yaml new file mode 100644 index 00000000..55a7aa5e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_dan_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "dan_Latn" +"description": "A split of Belebele for the dan_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_dan_Latn" diff --git a/lm_eval/tasks/belebele/belebele_deu_Latn.yaml b/lm_eval/tasks/belebele/belebele_deu_Latn.yaml new file mode 100644 index 00000000..e714c535 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_deu_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "deu_Latn" +"description": "A split of Belebele for the deu_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_deu_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ell_Grek.yaml b/lm_eval/tasks/belebele/belebele_ell_Grek.yaml new file mode 100644 index 00000000..d36ffcf0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ell_Grek.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ell_Grek" +"description": "A split of Belebele for the ell_Grek language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ell_Grek" diff --git a/lm_eval/tasks/belebele/belebele_eng_Latn.yaml b/lm_eval/tasks/belebele/belebele_eng_Latn.yaml new file mode 100644 index 00000000..3a50733f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_eng_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "eng_Latn" +"description": "A split of Belebele for the eng_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_eng_Latn" diff --git a/lm_eval/tasks/belebele/belebele_est_Latn.yaml b/lm_eval/tasks/belebele/belebele_est_Latn.yaml new file mode 100644 index 00000000..e7271fc5 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_est_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "est_Latn" +"description": "A split of Belebele for the est_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_est_Latn" diff --git a/lm_eval/tasks/belebele/belebele_eus_Latn.yaml b/lm_eval/tasks/belebele/belebele_eus_Latn.yaml new file mode 100644 index 00000000..36ba7097 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_eus_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "eus_Latn" +"description": "A split of Belebele for the eus_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_eus_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fin_Latn.yaml b/lm_eval/tasks/belebele/belebele_fin_Latn.yaml new file mode 100644 index 00000000..4419a4d4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_fin_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "fin_Latn" +"description": "A split of Belebele for the fin_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_fin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fra_Latn.yaml b/lm_eval/tasks/belebele/belebele_fra_Latn.yaml new file mode 100644 index 00000000..4c2798b6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_fra_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "fra_Latn" +"description": "A split of Belebele for the fra_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_fra_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml b/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml new file mode 100644 index 00000000..8b0cf15b --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "fuv_Latn" +"description": "A split of Belebele for the fuv_Latn language.\n\n" +"include": "_default_template_yaml" +"task": 
"belebele_fuv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml b/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml new file mode 100644 index 00000000..58ac2426 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "gaz_Latn" +"description": "A split of Belebele for the gaz_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_gaz_Latn" diff --git a/lm_eval/tasks/belebele/belebele_grn_Latn.yaml b/lm_eval/tasks/belebele/belebele_grn_Latn.yaml new file mode 100644 index 00000000..e38a79c6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_grn_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "grn_Latn" +"description": "A split of Belebele for the grn_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_grn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml b/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml new file mode 100644 index 00000000..49a0bf09 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml @@ -0,0 +1,4 @@ +"dataset_name": "guj_Gujr" +"description": "A split of Belebele for the guj_Gujr language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_guj_Gujr" diff --git a/lm_eval/tasks/belebele/belebele_hat_Latn.yaml b/lm_eval/tasks/belebele/belebele_hat_Latn.yaml new file mode 100644 index 00000000..f35731bb --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hat_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "hat_Latn" +"description": "A split of Belebele for the hat_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_hat_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hau_Latn.yaml b/lm_eval/tasks/belebele/belebele_hau_Latn.yaml new file mode 100644 index 00000000..e7003c80 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hau_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "hau_Latn" +"description": "A split of Belebele for the hau_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_hau_Latn" diff --git a/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml b/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml new file mode 100644 index 00000000..1ad5d3db --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml @@ -0,0 +1,4 @@ +"dataset_name": "heb_Hebr" +"description": "A split of Belebele for the heb_Hebr language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_heb_Hebr" diff --git a/lm_eval/tasks/belebele/belebele_hin_Deva.yaml b/lm_eval/tasks/belebele/belebele_hin_Deva.yaml new file mode 100644 index 00000000..bc19d627 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hin_Deva.yaml @@ -0,0 +1,4 @@ +"dataset_name": "hin_Deva" +"description": "A split of Belebele for the hin_Deva language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_hin_Deva" diff --git a/lm_eval/tasks/belebele/belebele_hin_Latn.yaml b/lm_eval/tasks/belebele/belebele_hin_Latn.yaml new file mode 100644 index 00000000..c8908768 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hin_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "hin_Latn" +"description": "A split of Belebele for the hin_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_hin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml b/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml new file mode 100644 index 00000000..94589e0b --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "hrv_Latn" +"description": "A split of Belebele for the hrv_Latn language.\n\n" +"include": 
"_default_template_yaml" +"task": "belebele_hrv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hun_Latn.yaml b/lm_eval/tasks/belebele/belebele_hun_Latn.yaml new file mode 100644 index 00000000..c8cf626c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hun_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "hun_Latn" +"description": "A split of Belebele for the hun_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_hun_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hye_Armn.yaml b/lm_eval/tasks/belebele/belebele_hye_Armn.yaml new file mode 100644 index 00000000..4c9698c6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hye_Armn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "hye_Armn" +"description": "A split of Belebele for the hye_Armn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_hye_Armn" diff --git a/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml b/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml new file mode 100644 index 00000000..4b30729f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ibo_Latn" +"description": "A split of Belebele for the ibo_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ibo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml b/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml new file mode 100644 index 00000000..1780bb28 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ilo_Latn" +"description": "A split of Belebele for the ilo_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ilo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ind_Latn.yaml b/lm_eval/tasks/belebele/belebele_ind_Latn.yaml new file mode 100644 index 00000000..64eaa2bf --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ind_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ind_Latn" +"description": "A split of Belebele for the ind_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ind_Latn" diff --git a/lm_eval/tasks/belebele/belebele_isl_Latn.yaml b/lm_eval/tasks/belebele/belebele_isl_Latn.yaml new file mode 100644 index 00000000..f6dd5145 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_isl_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "isl_Latn" +"description": "A split of Belebele for the isl_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_isl_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ita_Latn.yaml b/lm_eval/tasks/belebele/belebele_ita_Latn.yaml new file mode 100644 index 00000000..8c84e2cb --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ita_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ita_Latn" +"description": "A split of Belebele for the ita_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ita_Latn" diff --git a/lm_eval/tasks/belebele/belebele_jav_Latn.yaml b/lm_eval/tasks/belebele/belebele_jav_Latn.yaml new file mode 100644 index 00000000..64f5eb73 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_jav_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "jav_Latn" +"description": "A split of Belebele for the jav_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_jav_Latn" diff --git a/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml b/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml new file mode 100644 index 00000000..32e0cdbc --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml @@ -0,0 +1,4 @@ +"dataset_name": "jpn_Jpan" +"description": "A split of Belebele for the jpn_Jpan 
language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_jpn_Jpan" diff --git a/lm_eval/tasks/belebele/belebele_kac_Latn.yaml b/lm_eval/tasks/belebele/belebele_kac_Latn.yaml new file mode 100644 index 00000000..57a73540 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kac_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kac_Latn" +"description": "A split of Belebele for the kac_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kac_Latn" diff --git a/lm_eval/tasks/belebele/belebele_kan_Knda.yaml b/lm_eval/tasks/belebele/belebele_kan_Knda.yaml new file mode 100644 index 00000000..4633623c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kan_Knda.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kan_Knda" +"description": "A split of Belebele for the kan_Knda language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kan_Knda" diff --git a/lm_eval/tasks/belebele/belebele_kat_Geor.yaml b/lm_eval/tasks/belebele/belebele_kat_Geor.yaml new file mode 100644 index 00000000..2354d16f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kat_Geor.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kat_Geor" +"description": "A split of Belebele for the kat_Geor language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kat_Geor" diff --git a/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml new file mode 100644 index 00000000..60b3524e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kaz_Cyrl" +"description": "A split of Belebele for the kaz_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kaz_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_kea_Latn.yaml b/lm_eval/tasks/belebele/belebele_kea_Latn.yaml new file mode 100644 index 00000000..cae6b6a9 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kea_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kea_Latn" +"description": "A split of Belebele for the kea_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kea_Latn" diff --git a/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml new file mode 100644 index 00000000..8624d9f0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "khk_Cyrl" +"description": "A split of Belebele for the khk_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_khk_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml b/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml new file mode 100644 index 00000000..18be28c6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml @@ -0,0 +1,4 @@ +"dataset_name": "khm_Khmr" +"description": "A split of Belebele for the khm_Khmr language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_khm_Khmr" diff --git a/lm_eval/tasks/belebele/belebele_kin_Latn.yaml b/lm_eval/tasks/belebele/belebele_kin_Latn.yaml new file mode 100644 index 00000000..137d344d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kin_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kin_Latn" +"description": "A split of Belebele for the kin_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml new file mode 100644 index 00000000..cbb01f8a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kir_Cyrl" +"description": "A split 
of Belebele for the kir_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kir_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_kor_Hang.yaml b/lm_eval/tasks/belebele/belebele_kor_Hang.yaml new file mode 100644 index 00000000..2d50e396 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kor_Hang.yaml @@ -0,0 +1,4 @@ +"dataset_name": "kor_Hang" +"description": "A split of Belebele for the kor_Hang language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_kor_Hang" diff --git a/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml b/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml new file mode 100644 index 00000000..f2623c34 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml @@ -0,0 +1,4 @@ +"dataset_name": "lao_Laoo" +"description": "A split of Belebele for the lao_Laoo language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_lao_Laoo" diff --git a/lm_eval/tasks/belebele/belebele_lin_Latn.yaml b/lm_eval/tasks/belebele/belebele_lin_Latn.yaml new file mode 100644 index 00000000..6084a333 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lin_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "lin_Latn" +"description": "A split of Belebele for the lin_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_lin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lit_Latn.yaml b/lm_eval/tasks/belebele/belebele_lit_Latn.yaml new file mode 100644 index 00000000..fd78db81 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lit_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "lit_Latn" +"description": "A split of Belebele for the lit_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_lit_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lug_Latn.yaml b/lm_eval/tasks/belebele/belebele_lug_Latn.yaml new file mode 100644 index 00000000..9444b86b --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lug_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "lug_Latn" +"description": "A split of Belebele for the lug_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_lug_Latn" diff --git a/lm_eval/tasks/belebele/belebele_luo_Latn.yaml b/lm_eval/tasks/belebele/belebele_luo_Latn.yaml new file mode 100644 index 00000000..3a719081 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_luo_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "luo_Latn" +"description": "A split of Belebele for the luo_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_luo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml b/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml new file mode 100644 index 00000000..be393dde --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "lvs_Latn" +"description": "A split of Belebele for the lvs_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_lvs_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml b/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml new file mode 100644 index 00000000..24585d01 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml @@ -0,0 +1,4 @@ +"dataset_name": "mal_Mlym" +"description": "A split of Belebele for the mal_Mlym language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_mal_Mlym" diff --git a/lm_eval/tasks/belebele/belebele_mar_Deva.yaml b/lm_eval/tasks/belebele/belebele_mar_Deva.yaml new file mode 100644 index 00000000..7b04ff4a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mar_Deva.yaml @@ -0,0 +1,4 @@ +"dataset_name": 
"mar_Deva" +"description": "A split of Belebele for the mar_Deva language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_mar_Deva" diff --git a/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml new file mode 100644 index 00000000..49f91a5f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "mkd_Cyrl" +"description": "A split of Belebele for the mkd_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_mkd_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml b/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml new file mode 100644 index 00000000..1ecc0f38 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "mlt_Latn" +"description": "A split of Belebele for the mlt_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_mlt_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mri_Latn.yaml b/lm_eval/tasks/belebele/belebele_mri_Latn.yaml new file mode 100644 index 00000000..86c35b59 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mri_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "mri_Latn" +"description": "A split of Belebele for the mri_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_mri_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml b/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml new file mode 100644 index 00000000..ff1c10e8 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml @@ -0,0 +1,4 @@ +"dataset_name": "mya_Mymr" +"description": "A split of Belebele for the mya_Mymr language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_mya_Mymr" diff --git a/lm_eval/tasks/belebele/belebele_nld_Latn.yaml b/lm_eval/tasks/belebele/belebele_nld_Latn.yaml new file mode 100644 index 00000000..f97417c7 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nld_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nld_Latn" +"description": "A split of Belebele for the nld_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_nld_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nob_Latn.yaml b/lm_eval/tasks/belebele/belebele_nob_Latn.yaml new file mode 100644 index 00000000..62284741 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nob_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nob_Latn" +"description": "A split of Belebele for the nob_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_nob_Latn" diff --git a/lm_eval/tasks/belebele/belebele_npi_Deva.yaml b/lm_eval/tasks/belebele/belebele_npi_Deva.yaml new file mode 100644 index 00000000..ecd7920b --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_npi_Deva.yaml @@ -0,0 +1,4 @@ +"dataset_name": "npi_Deva" +"description": "A split of Belebele for the npi_Deva language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_npi_Deva" diff --git a/lm_eval/tasks/belebele/belebele_npi_Latn.yaml b/lm_eval/tasks/belebele/belebele_npi_Latn.yaml new file mode 100644 index 00000000..439730b5 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_npi_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "npi_Latn" +"description": "A split of Belebele for the npi_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_npi_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nso_Latn.yaml b/lm_eval/tasks/belebele/belebele_nso_Latn.yaml new file mode 100644 index 00000000..24821280 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nso_Latn.yaml @@ 
-0,0 +1,4 @@ +"dataset_name": "nso_Latn" +"description": "A split of Belebele for the nso_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_nso_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nya_Latn.yaml b/lm_eval/tasks/belebele/belebele_nya_Latn.yaml new file mode 100644 index 00000000..987fa736 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nya_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nya_Latn" +"description": "A split of Belebele for the nya_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_nya_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ory_Orya.yaml b/lm_eval/tasks/belebele/belebele_ory_Orya.yaml new file mode 100644 index 00000000..0cbb9bf8 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ory_Orya.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ory_Orya" +"description": "A split of Belebele for the ory_Orya language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ory_Orya" diff --git a/lm_eval/tasks/belebele/belebele_pan_Guru.yaml b/lm_eval/tasks/belebele/belebele_pan_Guru.yaml new file mode 100644 index 00000000..c266060c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pan_Guru.yaml @@ -0,0 +1,4 @@ +"dataset_name": "pan_Guru" +"description": "A split of Belebele for the pan_Guru language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_pan_Guru" diff --git a/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml b/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml new file mode 100644 index 00000000..f018adca --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "pbt_Arab" +"description": "A split of Belebele for the pbt_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_pbt_Arab" diff --git a/lm_eval/tasks/belebele/belebele_pes_Arab.yaml b/lm_eval/tasks/belebele/belebele_pes_Arab.yaml new file mode 100644 index 00000000..be1a4678 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pes_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "pes_Arab" +"description": "A split of Belebele for the pes_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_pes_Arab" diff --git a/lm_eval/tasks/belebele/belebele_plt_Latn.yaml b/lm_eval/tasks/belebele/belebele_plt_Latn.yaml new file mode 100644 index 00000000..ef0ea9c5 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_plt_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "plt_Latn" +"description": "A split of Belebele for the plt_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_plt_Latn" diff --git a/lm_eval/tasks/belebele/belebele_pol_Latn.yaml b/lm_eval/tasks/belebele/belebele_pol_Latn.yaml new file mode 100644 index 00000000..5091fa9a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pol_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "pol_Latn" +"description": "A split of Belebele for the pol_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_pol_Latn" diff --git a/lm_eval/tasks/belebele/belebele_por_Latn.yaml b/lm_eval/tasks/belebele/belebele_por_Latn.yaml new file mode 100644 index 00000000..4d735f1c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_por_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "por_Latn" +"description": "A split of Belebele for the por_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_por_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ron_Latn.yaml b/lm_eval/tasks/belebele/belebele_ron_Latn.yaml new file mode 100644 index 00000000..454b1682 --- /dev/null +++ 
b/lm_eval/tasks/belebele/belebele_ron_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ron_Latn" +"description": "A split of Belebele for the ron_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ron_Latn" diff --git a/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml new file mode 100644 index 00000000..7e2be793 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "rus_Cyrl" +"description": "A split of Belebele for the rus_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_rus_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml b/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml new file mode 100644 index 00000000..3ebc839f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml @@ -0,0 +1,4 @@ +"dataset_name": "shn_Mymr" +"description": "A split of Belebele for the shn_Mymr language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_shn_Mymr" diff --git a/lm_eval/tasks/belebele/belebele_sin_Latn.yaml b/lm_eval/tasks/belebele/belebele_sin_Latn.yaml new file mode 100644 index 00000000..05953e39 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sin_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sin_Latn" +"description": "A split of Belebele for the sin_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_sin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml b/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml new file mode 100644 index 00000000..ab802a87 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sin_Sinh" +"description": "A split of Belebele for the sin_Sinh language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_sin_Sinh" diff --git a/lm_eval/tasks/belebele/belebele_slk_Latn.yaml b/lm_eval/tasks/belebele/belebele_slk_Latn.yaml new file mode 100644 index 00000000..023139f8 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_slk_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "slk_Latn" +"description": "A split of Belebele for the slk_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_slk_Latn" diff --git a/lm_eval/tasks/belebele/belebele_slv_Latn.yaml b/lm_eval/tasks/belebele/belebele_slv_Latn.yaml new file mode 100644 index 00000000..5de85c80 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_slv_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "slv_Latn" +"description": "A split of Belebele for the slv_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_slv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sna_Latn.yaml b/lm_eval/tasks/belebele/belebele_sna_Latn.yaml new file mode 100644 index 00000000..fc624123 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sna_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sna_Latn" +"description": "A split of Belebele for the sna_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_sna_Latn" diff --git a/lm_eval/tasks/belebele/belebele_snd_Arab.yaml b/lm_eval/tasks/belebele/belebele_snd_Arab.yaml new file mode 100644 index 00000000..ce41c40e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_snd_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "snd_Arab" +"description": "A split of Belebele for the snd_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_snd_Arab" diff --git a/lm_eval/tasks/belebele/belebele_som_Latn.yaml b/lm_eval/tasks/belebele/belebele_som_Latn.yaml new file mode 100644 index 
00000000..330c2da3 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_som_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "som_Latn" +"description": "A split of Belebele for the som_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_som_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sot_Latn.yaml b/lm_eval/tasks/belebele/belebele_sot_Latn.yaml new file mode 100644 index 00000000..dcc0f9cc --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sot_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sot_Latn" +"description": "A split of Belebele for the sot_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_sot_Latn" diff --git a/lm_eval/tasks/belebele/belebele_spa_Latn.yaml b/lm_eval/tasks/belebele/belebele_spa_Latn.yaml new file mode 100644 index 00000000..b86137af --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_spa_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "spa_Latn" +"description": "A split of Belebele for the spa_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_spa_Latn" diff --git a/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml new file mode 100644 index 00000000..2f4307a3 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "srp_Cyrl" +"description": "A split of Belebele for the srp_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_srp_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml b/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml new file mode 100644 index 00000000..f83780bd --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ssw_Latn" +"description": "A split of Belebele for the ssw_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ssw_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sun_Latn.yaml b/lm_eval/tasks/belebele/belebele_sun_Latn.yaml new file mode 100644 index 00000000..fe41aead --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sun_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sun_Latn" +"description": "A split of Belebele for the sun_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_sun_Latn" diff --git a/lm_eval/tasks/belebele/belebele_swe_Latn.yaml b/lm_eval/tasks/belebele/belebele_swe_Latn.yaml new file mode 100644 index 00000000..97c0759f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_swe_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "swe_Latn" +"description": "A split of Belebele for the swe_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_swe_Latn" diff --git a/lm_eval/tasks/belebele/belebele_swh_Latn.yaml b/lm_eval/tasks/belebele/belebele_swh_Latn.yaml new file mode 100644 index 00000000..5ab3464a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_swh_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "swh_Latn" +"description": "A split of Belebele for the swh_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_swh_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tam_Taml.yaml b/lm_eval/tasks/belebele/belebele_tam_Taml.yaml new file mode 100644 index 00000000..05a84212 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tam_Taml.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tam_Taml" +"description": "A split of Belebele for the tam_Taml language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tam_Taml" diff --git a/lm_eval/tasks/belebele/belebele_tel_Telu.yaml b/lm_eval/tasks/belebele/belebele_tel_Telu.yaml 
new file mode 100644 index 00000000..30c5f89e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tel_Telu.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tel_Telu" +"description": "A split of Belebele for the tel_Telu language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tel_Telu" diff --git a/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml new file mode 100644 index 00000000..1a68bb92 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tgk_Cyrl" +"description": "A split of Belebele for the tgk_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tgk_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml b/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml new file mode 100644 index 00000000..42a2d8d0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tgl_Latn" +"description": "A split of Belebele for the tgl_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tgl_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tha_Thai.yaml b/lm_eval/tasks/belebele/belebele_tha_Thai.yaml new file mode 100644 index 00000000..254c046e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tha_Thai.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tha_Thai" +"description": "A split of Belebele for the tha_Thai language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tha_Thai" diff --git a/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml b/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml new file mode 100644 index 00000000..c456c566 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tir_Ethi" +"description": "A split of Belebele for the tir_Ethi language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tir_Ethi" diff --git a/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml b/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml new file mode 100644 index 00000000..77ef814e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tsn_Latn" +"description": "A split of Belebele for the tsn_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tsn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tso_Latn.yaml b/lm_eval/tasks/belebele/belebele_tso_Latn.yaml new file mode 100644 index 00000000..cbfebb32 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tso_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tso_Latn" +"description": "A split of Belebele for the tso_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tso_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tur_Latn.yaml b/lm_eval/tasks/belebele/belebele_tur_Latn.yaml new file mode 100644 index 00000000..61a55691 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tur_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tur_Latn" +"description": "A split of Belebele for the tur_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_tur_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml new file mode 100644 index 00000000..bf496a6d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ukr_Cyrl" +"description": "A split of Belebele for the ukr_Cyrl language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_ukr_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_urd_Arab.yaml 
b/lm_eval/tasks/belebele/belebele_urd_Arab.yaml new file mode 100644 index 00000000..b210f8b6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_urd_Arab.yaml @@ -0,0 +1,4 @@ +"dataset_name": "urd_Arab" +"description": "A split of Belebele for the urd_Arab language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_urd_Arab" diff --git a/lm_eval/tasks/belebele/belebele_urd_Latn.yaml b/lm_eval/tasks/belebele/belebele_urd_Latn.yaml new file mode 100644 index 00000000..d48e79e6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_urd_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "urd_Latn" +"description": "A split of Belebele for the urd_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_urd_Latn" diff --git a/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml b/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml new file mode 100644 index 00000000..e45b09e9 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "uzn_Latn" +"description": "A split of Belebele for the uzn_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_uzn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_vie_Latn.yaml b/lm_eval/tasks/belebele/belebele_vie_Latn.yaml new file mode 100644 index 00000000..a420a4c9 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_vie_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "vie_Latn" +"description": "A split of Belebele for the vie_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_vie_Latn" diff --git a/lm_eval/tasks/belebele/belebele_war_Latn.yaml b/lm_eval/tasks/belebele/belebele_war_Latn.yaml new file mode 100644 index 00000000..45dfd5fa --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_war_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "war_Latn" +"description": "A split of Belebele for the war_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_war_Latn" diff --git a/lm_eval/tasks/belebele/belebele_wol_Latn.yaml b/lm_eval/tasks/belebele/belebele_wol_Latn.yaml new file mode 100644 index 00000000..1cac6be5 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_wol_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "wol_Latn" +"description": "A split of Belebele for the wol_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_wol_Latn" diff --git a/lm_eval/tasks/belebele/belebele_xho_Latn.yaml b/lm_eval/tasks/belebele/belebele_xho_Latn.yaml new file mode 100644 index 00000000..a1e36894 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_xho_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "xho_Latn" +"description": "A split of Belebele for the xho_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_xho_Latn" diff --git a/lm_eval/tasks/belebele/belebele_yor_Latn.yaml b/lm_eval/tasks/belebele/belebele_yor_Latn.yaml new file mode 100644 index 00000000..2de6aecb --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_yor_Latn.yaml @@ -0,0 +1,4 @@ +"dataset_name": "yor_Latn" +"description": "A split of Belebele for the yor_Latn language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_yor_Latn" diff --git a/lm_eval/tasks/belebele/belebele_zho_Hans.yaml b/lm_eval/tasks/belebele/belebele_zho_Hans.yaml new file mode 100644 index 00000000..cdafe7f8 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_zho_Hans.yaml @@ -0,0 +1,4 @@ +"dataset_name": "zho_Hans" +"description": "A split of Belebele for the zho_Hans language.\n\n" +"include": "_default_template_yaml" +"task": "belebele_zho_Hans" diff --git 
a/lm_eval/tasks/belebele/belebele_zho_Hant.yaml b/lm_eval/tasks/belebele/belebele_zho_Hant.yaml
new file mode 100644
index 00000000..531af9e1
--- /dev/null
+++ b/lm_eval/tasks/belebele/belebele_zho_Hant.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "zho_Hant"
+"description": "A split of Belebele for the zho_Hant language.\n\n"
+"include": "_default_template_yaml"
+"task": "belebele_zho_Hant"
diff --git a/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml b/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml
new file mode 100644
index 00000000..b005512f
--- /dev/null
+++ b/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "zsm_Latn"
+"description": "A split of Belebele for the zsm_Latn language.\n\n"
+"include": "_default_template_yaml"
+"task": "belebele_zsm_Latn"
diff --git a/lm_eval/tasks/belebele/belebele_zul_Latn.yaml b/lm_eval/tasks/belebele/belebele_zul_Latn.yaml
new file mode 100644
index 00000000..04cc5b15
--- /dev/null
+++ b/lm_eval/tasks/belebele/belebele_zul_Latn.yaml
@@ -0,0 +1,4 @@
+"dataset_name": "zul_Latn"
+"description": "A split of Belebele for the zul_Latn language.\n\n"
+"include": "_default_template_yaml"
+"task": "belebele_zul_Latn"
--
GitLab

From 2d6bc236290c529025e811da655d5052a5d39348 Mon Sep 17 00:00:00 2001
From: ManuelFay
Date: Mon, 25 Sep 2023 19:37:03 +0200
Subject: [PATCH 054/212] remove description

---
 lm_eval/tasks/belebele/_generate_configs.py | 5 +----
 lm_eval/tasks/belebele/belebele_acm_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_afr_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_als_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_amh_Ethi.yaml | 1 -
 lm_eval/tasks/belebele/belebele_apc_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_arb_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_arb_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ars_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ary_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_arz_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_asm_Beng.yaml | 1 -
 lm_eval/tasks/belebele/belebele_azj_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_bam_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ben_Beng.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ben_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_bod_Tibt.yaml | 1 -
 lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_cat_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ceb_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ces_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ckb_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_dan_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_deu_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ell_Grek.yaml | 1 -
 lm_eval/tasks/belebele/belebele_eng_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_est_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_eus_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_fin_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_fra_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_fuv_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_gaz_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_grn_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_guj_Gujr.yaml | 1 -
 lm_eval/tasks/belebele/belebele_hat_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_hau_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_heb_Hebr.yaml | 1 -
 lm_eval/tasks/belebele/belebele_hin_Deva.yaml | 1 -
 lm_eval/tasks/belebele/belebele_hin_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_hrv_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_hun_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_hye_Armn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ibo_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ilo_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ind_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_isl_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ita_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_jav_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kac_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kan_Knda.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kat_Geor.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kea_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_khm_Khmr.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kin_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_kor_Hang.yaml | 1 -
 lm_eval/tasks/belebele/belebele_lao_Laoo.yaml | 1 -
 lm_eval/tasks/belebele/belebele_lin_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_lit_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_lug_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_luo_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_lvs_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_mal_Mlym.yaml | 1 -
 lm_eval/tasks/belebele/belebele_mar_Deva.yaml | 1 -
 lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_mlt_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_mri_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_mya_Mymr.yaml | 1 -
 lm_eval/tasks/belebele/belebele_nld_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_nob_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_npi_Deva.yaml | 1 -
 lm_eval/tasks/belebele/belebele_npi_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_nso_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_nya_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ory_Orya.yaml | 1 -
 lm_eval/tasks/belebele/belebele_pan_Guru.yaml | 1 -
 lm_eval/tasks/belebele/belebele_pbt_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_pes_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_plt_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_pol_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_por_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ron_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_shn_Mymr.yaml | 1 -
 lm_eval/tasks/belebele/belebele_sin_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_sin_Sinh.yaml | 1 -
 lm_eval/tasks/belebele/belebele_slk_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_slv_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_sna_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_snd_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_som_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_sot_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_spa_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ssw_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_sun_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_swe_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_swh_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tam_Taml.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tel_Telu.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tgl_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tha_Thai.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tir_Ethi.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tsn_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tso_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_tur_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml | 1 -
 lm_eval/tasks/belebele/belebele_urd_Arab.yaml | 1 -
 lm_eval/tasks/belebele/belebele_urd_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_uzn_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_vie_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_war_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_wol_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_xho_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_yor_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_zho_Hans.yaml | 1 -
 lm_eval/tasks/belebele/belebele_zho_Hant.yaml | 1 -
 lm_eval/tasks/belebele/belebele_zsm_Latn.yaml | 1 -
 lm_eval/tasks/belebele/belebele_zul_Latn.yaml | 1 -
 123 files changed, 1 insertion(+), 126 deletions(-)

diff --git a/lm_eval/tasks/belebele/_generate_configs.py b/lm_eval/tasks/belebele/_generate_configs.py
index 870d9773..9df56f5f 100644
--- a/lm_eval/tasks/belebele/_generate_configs.py
+++ b/lm_eval/tasks/belebele/_generate_configs.py
@@ -44,15 +44,12 @@ if __name__ == "__main__":
     languages = [split["config"] for split in query()]

     for lang in tqdm(languages):
-        description = f"A split of Belebele for the {lang} language.\n\n"
-
         yaml_dict = {
             "include": base_yaml_name,
-            "task": f"belebele_{args.task_prefix}_{subject_eng}"
+            "task": f"belebele_{args.task_prefix}_{lang}"
             if args.task_prefix != ""
             else f"belebele_{lang}",
             "dataset_name": lang,
-            "description": description,
         }

         file_save_path = args.save_prefix_path + f"_{lang}.yaml"
diff --git a/lm_eval/tasks/belebele/belebele_acm_Arab.yaml b/lm_eval/tasks/belebele/belebele_acm_Arab.yaml
index 5059db8f..7afe81f4 100644
--- a/lm_eval/tasks/belebele/belebele_acm_Arab.yaml
+++ b/lm_eval/tasks/belebele/belebele_acm_Arab.yaml
@@ -1,4 +1,3 @@
 "dataset_name": "acm_Arab"
-"description": "A split of Belebele for the acm_Arab language.\n\n"
 "include": "_default_template_yaml"
 "task": "belebele_acm_Arab"
diff --git a/lm_eval/tasks/belebele/belebele_afr_Latn.yaml b/lm_eval/tasks/belebele/belebele_afr_Latn.yaml
index b290183f..8ced8ffc 100644
--- a/lm_eval/tasks/belebele/belebele_afr_Latn.yaml
+++ b/lm_eval/tasks/belebele/belebele_afr_Latn.yaml
@@ -1,4 +1,3 @@
 "dataset_name": "afr_Latn"
-"description": "A split of Belebele for the afr_Latn language.\n\n"
 "include": "_default_template_yaml"
 "task": "belebele_afr_Latn"
diff --git a/lm_eval/tasks/belebele/belebele_als_Latn.yaml b/lm_eval/tasks/belebele/belebele_als_Latn.yaml
index 1bda097d..507fe758 100644
--- a/lm_eval/tasks/belebele/belebele_als_Latn.yaml
+++ b/lm_eval/tasks/belebele/belebele_als_Latn.yaml
@@ -1,4 +1,3 @@
 "dataset_name": "als_Latn"
-"description": "A split of Belebele for the als_Latn language.\n\n"
 "include": "_default_template_yaml"
 "task": "belebele_als_Latn"
diff --git a/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml b/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml
index 615570e7..a0b4bd0a 100644
--- a/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml
+++ b/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml
@@ -1,4 +1,3 @@
 "dataset_name": "amh_Ethi"
-"description": "A split of Belebele for the amh_Ethi language.\n\n"
 "include": "_default_template_yaml"
 "task": "belebele_amh_Ethi"
diff --git a/lm_eval/tasks/belebele/belebele_apc_Arab.yaml b/lm_eval/tasks/belebele/belebele_apc_Arab.yaml
index 64102c05..d5d6777f 100644
--- a/lm_eval/tasks/belebele/belebele_apc_Arab.yaml
+++ b/lm_eval/tasks/belebele/belebele_apc_Arab.yaml
@@ -1,4 +1,3 @@
 "dataset_name": "apc_Arab"
-"description": "A split of Belebele for the apc_Arab language.\n\n"
 "include": "_default_template_yaml"
 "task": "belebele_apc_Arab"
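Note on the `_generate_configs.py` hunk above: besides dropping the per-language `description` key, it fixes the `task` field, which previously interpolated `subject_eng`, a name that appears nowhere else in this script's visible context (an apparent leftover from the MMLU config generator this script mirrors) and would have raised a NameError whenever `--task_prefix` was non-empty. After regeneration, each per-language YAML should reduce to the three keys seen in the surrounding per-file diffs. A minimal sanity check for that schema, assuming it is run from the repository root:

import glob

import yaml

# Every regenerated belebele_<lang>.yaml should now hold exactly these
# three keys; the prompt template and metrics live in the shared
# _default_template_yaml instead.
expected_keys = {"include", "task", "dataset_name"}

paths = sorted(glob.glob("lm_eval/tasks/belebele/belebele_*.yaml"))
for path in paths:
    with open(path) as f:
        cfg = yaml.safe_load(f)
    assert set(cfg) == expected_keys, (path, sorted(cfg))
print(f"checked {len(paths)} configs")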
"belebele_apc_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arb_Arab.yaml b/lm_eval/tasks/belebele/belebele_arb_Arab.yaml index f789df9c..a61d78ed 100644 --- a/lm_eval/tasks/belebele/belebele_arb_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_arb_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "arb_Arab" -"description": "A split of Belebele for the arb_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_arb_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arb_Latn.yaml b/lm_eval/tasks/belebele/belebele_arb_Latn.yaml index f9a4180e..97684654 100644 --- a/lm_eval/tasks/belebele/belebele_arb_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_arb_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "arb_Latn" -"description": "A split of Belebele for the arb_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_arb_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ars_Arab.yaml b/lm_eval/tasks/belebele/belebele_ars_Arab.yaml index 2c7d94c6..ac322a6b 100644 --- a/lm_eval/tasks/belebele/belebele_ars_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_ars_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "ars_Arab" -"description": "A split of Belebele for the ars_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_ars_Arab" diff --git a/lm_eval/tasks/belebele/belebele_ary_Arab.yaml b/lm_eval/tasks/belebele/belebele_ary_Arab.yaml index 0a2d45b7..78bb8e29 100644 --- a/lm_eval/tasks/belebele/belebele_ary_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_ary_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "ary_Arab" -"description": "A split of Belebele for the ary_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_ary_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arz_Arab.yaml b/lm_eval/tasks/belebele/belebele_arz_Arab.yaml index df78aeb1..28d8565f 100644 --- a/lm_eval/tasks/belebele/belebele_arz_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_arz_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "arz_Arab" -"description": "A split of Belebele for the arz_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_arz_Arab" diff --git a/lm_eval/tasks/belebele/belebele_asm_Beng.yaml b/lm_eval/tasks/belebele/belebele_asm_Beng.yaml index 1319b62c..fcc708df 100644 --- a/lm_eval/tasks/belebele/belebele_asm_Beng.yaml +++ b/lm_eval/tasks/belebele/belebele_asm_Beng.yaml @@ -1,4 +1,3 @@ "dataset_name": "asm_Beng" -"description": "A split of Belebele for the asm_Beng language.\n\n" "include": "_default_template_yaml" "task": "belebele_asm_Beng" diff --git a/lm_eval/tasks/belebele/belebele_azj_Latn.yaml b/lm_eval/tasks/belebele/belebele_azj_Latn.yaml index 581b7625..a5add1b7 100644 --- a/lm_eval/tasks/belebele/belebele_azj_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_azj_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "azj_Latn" -"description": "A split of Belebele for the azj_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_azj_Latn" diff --git a/lm_eval/tasks/belebele/belebele_bam_Latn.yaml b/lm_eval/tasks/belebele/belebele_bam_Latn.yaml index 82399037..7c2585d7 100644 --- a/lm_eval/tasks/belebele/belebele_bam_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_bam_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "bam_Latn" -"description": "A split of Belebele for the bam_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_bam_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ben_Beng.yaml b/lm_eval/tasks/belebele/belebele_ben_Beng.yaml index 3fca76df..62e9ea10 100644 --- 
a/lm_eval/tasks/belebele/belebele_ben_Beng.yaml +++ b/lm_eval/tasks/belebele/belebele_ben_Beng.yaml @@ -1,4 +1,3 @@ "dataset_name": "ben_Beng" -"description": "A split of Belebele for the ben_Beng language.\n\n" "include": "_default_template_yaml" "task": "belebele_ben_Beng" diff --git a/lm_eval/tasks/belebele/belebele_ben_Latn.yaml b/lm_eval/tasks/belebele/belebele_ben_Latn.yaml index a14f0e87..9ed8d7fa 100644 --- a/lm_eval/tasks/belebele/belebele_ben_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ben_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ben_Latn" -"description": "A split of Belebele for the ben_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ben_Latn" diff --git a/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml b/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml index da1b65b4..8cf0464e 100644 --- a/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml +++ b/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml @@ -1,4 +1,3 @@ "dataset_name": "bod_Tibt" -"description": "A split of Belebele for the bod_Tibt language.\n\n" "include": "_default_template_yaml" "task": "belebele_bod_Tibt" diff --git a/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml index 8d2cd4a0..a34c29e3 100644 --- a/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "bul_Cyrl" -"description": "A split of Belebele for the bul_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_bul_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_cat_Latn.yaml b/lm_eval/tasks/belebele/belebele_cat_Latn.yaml index b04cabba..fa6af717 100644 --- a/lm_eval/tasks/belebele/belebele_cat_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_cat_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "cat_Latn" -"description": "A split of Belebele for the cat_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_cat_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml b/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml index 8df66adf..d2d0a2dd 100644 --- a/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ceb_Latn" -"description": "A split of Belebele for the ceb_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ceb_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ces_Latn.yaml b/lm_eval/tasks/belebele/belebele_ces_Latn.yaml index 13b84c63..ee7b6894 100644 --- a/lm_eval/tasks/belebele/belebele_ces_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ces_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ces_Latn" -"description": "A split of Belebele for the ces_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ces_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml b/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml index bf7465c1..02277ba5 100644 --- a/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "ckb_Arab" -"description": "A split of Belebele for the ckb_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_ckb_Arab" diff --git a/lm_eval/tasks/belebele/belebele_dan_Latn.yaml b/lm_eval/tasks/belebele/belebele_dan_Latn.yaml index 55a7aa5e..45555e9d 100644 --- a/lm_eval/tasks/belebele/belebele_dan_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_dan_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "dan_Latn" -"description": "A split of Belebele for the dan_Latn 
language.\n\n" "include": "_default_template_yaml" "task": "belebele_dan_Latn" diff --git a/lm_eval/tasks/belebele/belebele_deu_Latn.yaml b/lm_eval/tasks/belebele/belebele_deu_Latn.yaml index e714c535..543d32a8 100644 --- a/lm_eval/tasks/belebele/belebele_deu_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_deu_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "deu_Latn" -"description": "A split of Belebele for the deu_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_deu_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ell_Grek.yaml b/lm_eval/tasks/belebele/belebele_ell_Grek.yaml index d36ffcf0..8b5bc5ad 100644 --- a/lm_eval/tasks/belebele/belebele_ell_Grek.yaml +++ b/lm_eval/tasks/belebele/belebele_ell_Grek.yaml @@ -1,4 +1,3 @@ "dataset_name": "ell_Grek" -"description": "A split of Belebele for the ell_Grek language.\n\n" "include": "_default_template_yaml" "task": "belebele_ell_Grek" diff --git a/lm_eval/tasks/belebele/belebele_eng_Latn.yaml b/lm_eval/tasks/belebele/belebele_eng_Latn.yaml index 3a50733f..f02cdeb9 100644 --- a/lm_eval/tasks/belebele/belebele_eng_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_eng_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "eng_Latn" -"description": "A split of Belebele for the eng_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_eng_Latn" diff --git a/lm_eval/tasks/belebele/belebele_est_Latn.yaml b/lm_eval/tasks/belebele/belebele_est_Latn.yaml index e7271fc5..3d74778c 100644 --- a/lm_eval/tasks/belebele/belebele_est_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_est_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "est_Latn" -"description": "A split of Belebele for the est_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_est_Latn" diff --git a/lm_eval/tasks/belebele/belebele_eus_Latn.yaml b/lm_eval/tasks/belebele/belebele_eus_Latn.yaml index 36ba7097..18711684 100644 --- a/lm_eval/tasks/belebele/belebele_eus_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_eus_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "eus_Latn" -"description": "A split of Belebele for the eus_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_eus_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fin_Latn.yaml b/lm_eval/tasks/belebele/belebele_fin_Latn.yaml index 4419a4d4..898a6e7a 100644 --- a/lm_eval/tasks/belebele/belebele_fin_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_fin_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "fin_Latn" -"description": "A split of Belebele for the fin_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_fin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fra_Latn.yaml b/lm_eval/tasks/belebele/belebele_fra_Latn.yaml index 4c2798b6..154b62d8 100644 --- a/lm_eval/tasks/belebele/belebele_fra_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_fra_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "fra_Latn" -"description": "A split of Belebele for the fra_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_fra_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml b/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml index 8b0cf15b..8015f090 100644 --- a/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "fuv_Latn" -"description": "A split of Belebele for the fuv_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_fuv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml b/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml index 
58ac2426..c671796f 100644 --- a/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "gaz_Latn" -"description": "A split of Belebele for the gaz_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_gaz_Latn" diff --git a/lm_eval/tasks/belebele/belebele_grn_Latn.yaml b/lm_eval/tasks/belebele/belebele_grn_Latn.yaml index e38a79c6..fbb2b8aa 100644 --- a/lm_eval/tasks/belebele/belebele_grn_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_grn_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "grn_Latn" -"description": "A split of Belebele for the grn_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_grn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml b/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml index 49a0bf09..02e716d0 100644 --- a/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml +++ b/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml @@ -1,4 +1,3 @@ "dataset_name": "guj_Gujr" -"description": "A split of Belebele for the guj_Gujr language.\n\n" "include": "_default_template_yaml" "task": "belebele_guj_Gujr" diff --git a/lm_eval/tasks/belebele/belebele_hat_Latn.yaml b/lm_eval/tasks/belebele/belebele_hat_Latn.yaml index f35731bb..691da434 100644 --- a/lm_eval/tasks/belebele/belebele_hat_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_hat_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "hat_Latn" -"description": "A split of Belebele for the hat_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_hat_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hau_Latn.yaml b/lm_eval/tasks/belebele/belebele_hau_Latn.yaml index e7003c80..ff94e767 100644 --- a/lm_eval/tasks/belebele/belebele_hau_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_hau_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "hau_Latn" -"description": "A split of Belebele for the hau_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_hau_Latn" diff --git a/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml b/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml index 1ad5d3db..b46a240f 100644 --- a/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml +++ b/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml @@ -1,4 +1,3 @@ "dataset_name": "heb_Hebr" -"description": "A split of Belebele for the heb_Hebr language.\n\n" "include": "_default_template_yaml" "task": "belebele_heb_Hebr" diff --git a/lm_eval/tasks/belebele/belebele_hin_Deva.yaml b/lm_eval/tasks/belebele/belebele_hin_Deva.yaml index bc19d627..e3e7c1ae 100644 --- a/lm_eval/tasks/belebele/belebele_hin_Deva.yaml +++ b/lm_eval/tasks/belebele/belebele_hin_Deva.yaml @@ -1,4 +1,3 @@ "dataset_name": "hin_Deva" -"description": "A split of Belebele for the hin_Deva language.\n\n" "include": "_default_template_yaml" "task": "belebele_hin_Deva" diff --git a/lm_eval/tasks/belebele/belebele_hin_Latn.yaml b/lm_eval/tasks/belebele/belebele_hin_Latn.yaml index c8908768..37085a32 100644 --- a/lm_eval/tasks/belebele/belebele_hin_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_hin_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "hin_Latn" -"description": "A split of Belebele for the hin_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_hin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml b/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml index 94589e0b..1b501540 100644 --- a/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "hrv_Latn" -"description": "A split of 
Belebele for the hrv_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_hrv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hun_Latn.yaml b/lm_eval/tasks/belebele/belebele_hun_Latn.yaml index c8cf626c..6d211f59 100644 --- a/lm_eval/tasks/belebele/belebele_hun_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_hun_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "hun_Latn" -"description": "A split of Belebele for the hun_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_hun_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hye_Armn.yaml b/lm_eval/tasks/belebele/belebele_hye_Armn.yaml index 4c9698c6..6752b2f7 100644 --- a/lm_eval/tasks/belebele/belebele_hye_Armn.yaml +++ b/lm_eval/tasks/belebele/belebele_hye_Armn.yaml @@ -1,4 +1,3 @@ "dataset_name": "hye_Armn" -"description": "A split of Belebele for the hye_Armn language.\n\n" "include": "_default_template_yaml" "task": "belebele_hye_Armn" diff --git a/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml b/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml index 4b30729f..17e48a75 100644 --- a/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ibo_Latn" -"description": "A split of Belebele for the ibo_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ibo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml b/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml index 1780bb28..457aa2e1 100644 --- a/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ilo_Latn" -"description": "A split of Belebele for the ilo_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ilo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ind_Latn.yaml b/lm_eval/tasks/belebele/belebele_ind_Latn.yaml index 64eaa2bf..c90532f4 100644 --- a/lm_eval/tasks/belebele/belebele_ind_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ind_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ind_Latn" -"description": "A split of Belebele for the ind_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ind_Latn" diff --git a/lm_eval/tasks/belebele/belebele_isl_Latn.yaml b/lm_eval/tasks/belebele/belebele_isl_Latn.yaml index f6dd5145..eece64e1 100644 --- a/lm_eval/tasks/belebele/belebele_isl_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_isl_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "isl_Latn" -"description": "A split of Belebele for the isl_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_isl_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ita_Latn.yaml b/lm_eval/tasks/belebele/belebele_ita_Latn.yaml index 8c84e2cb..ac958a6a 100644 --- a/lm_eval/tasks/belebele/belebele_ita_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ita_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ita_Latn" -"description": "A split of Belebele for the ita_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ita_Latn" diff --git a/lm_eval/tasks/belebele/belebele_jav_Latn.yaml b/lm_eval/tasks/belebele/belebele_jav_Latn.yaml index 64f5eb73..57435d1a 100644 --- a/lm_eval/tasks/belebele/belebele_jav_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_jav_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "jav_Latn" -"description": "A split of Belebele for the jav_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_jav_Latn" diff --git a/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml 
b/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml index 32e0cdbc..b6d09451 100644 --- a/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml +++ b/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml @@ -1,4 +1,3 @@ "dataset_name": "jpn_Jpan" -"description": "A split of Belebele for the jpn_Jpan language.\n\n" "include": "_default_template_yaml" "task": "belebele_jpn_Jpan" diff --git a/lm_eval/tasks/belebele/belebele_kac_Latn.yaml b/lm_eval/tasks/belebele/belebele_kac_Latn.yaml index 57a73540..090c1356 100644 --- a/lm_eval/tasks/belebele/belebele_kac_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_kac_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "kac_Latn" -"description": "A split of Belebele for the kac_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_kac_Latn" diff --git a/lm_eval/tasks/belebele/belebele_kan_Knda.yaml b/lm_eval/tasks/belebele/belebele_kan_Knda.yaml index 4633623c..0085fff8 100644 --- a/lm_eval/tasks/belebele/belebele_kan_Knda.yaml +++ b/lm_eval/tasks/belebele/belebele_kan_Knda.yaml @@ -1,4 +1,3 @@ "dataset_name": "kan_Knda" -"description": "A split of Belebele for the kan_Knda language.\n\n" "include": "_default_template_yaml" "task": "belebele_kan_Knda" diff --git a/lm_eval/tasks/belebele/belebele_kat_Geor.yaml b/lm_eval/tasks/belebele/belebele_kat_Geor.yaml index 2354d16f..0b681bba 100644 --- a/lm_eval/tasks/belebele/belebele_kat_Geor.yaml +++ b/lm_eval/tasks/belebele/belebele_kat_Geor.yaml @@ -1,4 +1,3 @@ "dataset_name": "kat_Geor" -"description": "A split of Belebele for the kat_Geor language.\n\n" "include": "_default_template_yaml" "task": "belebele_kat_Geor" diff --git a/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml index 60b3524e..70c7c155 100644 --- a/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "kaz_Cyrl" -"description": "A split of Belebele for the kaz_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_kaz_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_kea_Latn.yaml b/lm_eval/tasks/belebele/belebele_kea_Latn.yaml index cae6b6a9..aae70568 100644 --- a/lm_eval/tasks/belebele/belebele_kea_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_kea_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "kea_Latn" -"description": "A split of Belebele for the kea_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_kea_Latn" diff --git a/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml index 8624d9f0..53d8839e 100644 --- a/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "khk_Cyrl" -"description": "A split of Belebele for the khk_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_khk_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml b/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml index 18be28c6..ef388cd2 100644 --- a/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml +++ b/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml @@ -1,4 +1,3 @@ "dataset_name": "khm_Khmr" -"description": "A split of Belebele for the khm_Khmr language.\n\n" "include": "_default_template_yaml" "task": "belebele_khm_Khmr" diff --git a/lm_eval/tasks/belebele/belebele_kin_Latn.yaml b/lm_eval/tasks/belebele/belebele_kin_Latn.yaml index 137d344d..edfeb80a 100644 --- a/lm_eval/tasks/belebele/belebele_kin_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_kin_Latn.yaml @@ -1,4 +1,3 @@ 
"dataset_name": "kin_Latn" -"description": "A split of Belebele for the kin_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_kin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml index cbb01f8a..a6cb7a4d 100644 --- a/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "kir_Cyrl" -"description": "A split of Belebele for the kir_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_kir_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_kor_Hang.yaml b/lm_eval/tasks/belebele/belebele_kor_Hang.yaml index 2d50e396..ece7f55e 100644 --- a/lm_eval/tasks/belebele/belebele_kor_Hang.yaml +++ b/lm_eval/tasks/belebele/belebele_kor_Hang.yaml @@ -1,4 +1,3 @@ "dataset_name": "kor_Hang" -"description": "A split of Belebele for the kor_Hang language.\n\n" "include": "_default_template_yaml" "task": "belebele_kor_Hang" diff --git a/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml b/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml index f2623c34..6012ac1c 100644 --- a/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml +++ b/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml @@ -1,4 +1,3 @@ "dataset_name": "lao_Laoo" -"description": "A split of Belebele for the lao_Laoo language.\n\n" "include": "_default_template_yaml" "task": "belebele_lao_Laoo" diff --git a/lm_eval/tasks/belebele/belebele_lin_Latn.yaml b/lm_eval/tasks/belebele/belebele_lin_Latn.yaml index 6084a333..fb33b859 100644 --- a/lm_eval/tasks/belebele/belebele_lin_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_lin_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "lin_Latn" -"description": "A split of Belebele for the lin_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_lin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lit_Latn.yaml b/lm_eval/tasks/belebele/belebele_lit_Latn.yaml index fd78db81..e9943bd0 100644 --- a/lm_eval/tasks/belebele/belebele_lit_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_lit_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "lit_Latn" -"description": "A split of Belebele for the lit_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_lit_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lug_Latn.yaml b/lm_eval/tasks/belebele/belebele_lug_Latn.yaml index 9444b86b..19d4f056 100644 --- a/lm_eval/tasks/belebele/belebele_lug_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_lug_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "lug_Latn" -"description": "A split of Belebele for the lug_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_lug_Latn" diff --git a/lm_eval/tasks/belebele/belebele_luo_Latn.yaml b/lm_eval/tasks/belebele/belebele_luo_Latn.yaml index 3a719081..73cc0aee 100644 --- a/lm_eval/tasks/belebele/belebele_luo_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_luo_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "luo_Latn" -"description": "A split of Belebele for the luo_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_luo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml b/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml index be393dde..18d291e5 100644 --- a/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "lvs_Latn" -"description": "A split of Belebele for the lvs_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_lvs_Latn" diff --git 
a/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml b/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml index 24585d01..283c67b2 100644 --- a/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml +++ b/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml @@ -1,4 +1,3 @@ "dataset_name": "mal_Mlym" -"description": "A split of Belebele for the mal_Mlym language.\n\n" "include": "_default_template_yaml" "task": "belebele_mal_Mlym" diff --git a/lm_eval/tasks/belebele/belebele_mar_Deva.yaml b/lm_eval/tasks/belebele/belebele_mar_Deva.yaml index 7b04ff4a..3a103e88 100644 --- a/lm_eval/tasks/belebele/belebele_mar_Deva.yaml +++ b/lm_eval/tasks/belebele/belebele_mar_Deva.yaml @@ -1,4 +1,3 @@ "dataset_name": "mar_Deva" -"description": "A split of Belebele for the mar_Deva language.\n\n" "include": "_default_template_yaml" "task": "belebele_mar_Deva" diff --git a/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml index 49f91a5f..e3a696b4 100644 --- a/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "mkd_Cyrl" -"description": "A split of Belebele for the mkd_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_mkd_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml b/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml index 1ecc0f38..2067469a 100644 --- a/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "mlt_Latn" -"description": "A split of Belebele for the mlt_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_mlt_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mri_Latn.yaml b/lm_eval/tasks/belebele/belebele_mri_Latn.yaml index 86c35b59..6cdfb5a3 100644 --- a/lm_eval/tasks/belebele/belebele_mri_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_mri_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "mri_Latn" -"description": "A split of Belebele for the mri_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_mri_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml b/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml index ff1c10e8..02a632b8 100644 --- a/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml +++ b/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml @@ -1,4 +1,3 @@ "dataset_name": "mya_Mymr" -"description": "A split of Belebele for the mya_Mymr language.\n\n" "include": "_default_template_yaml" "task": "belebele_mya_Mymr" diff --git a/lm_eval/tasks/belebele/belebele_nld_Latn.yaml b/lm_eval/tasks/belebele/belebele_nld_Latn.yaml index f97417c7..e32ebd4d 100644 --- a/lm_eval/tasks/belebele/belebele_nld_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_nld_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "nld_Latn" -"description": "A split of Belebele for the nld_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_nld_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nob_Latn.yaml b/lm_eval/tasks/belebele/belebele_nob_Latn.yaml index 62284741..29c690e5 100644 --- a/lm_eval/tasks/belebele/belebele_nob_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_nob_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "nob_Latn" -"description": "A split of Belebele for the nob_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_nob_Latn" diff --git a/lm_eval/tasks/belebele/belebele_npi_Deva.yaml b/lm_eval/tasks/belebele/belebele_npi_Deva.yaml index ecd7920b..fe2a8226 100644 --- a/lm_eval/tasks/belebele/belebele_npi_Deva.yaml +++ 
b/lm_eval/tasks/belebele/belebele_npi_Deva.yaml @@ -1,4 +1,3 @@ "dataset_name": "npi_Deva" -"description": "A split of Belebele for the npi_Deva language.\n\n" "include": "_default_template_yaml" "task": "belebele_npi_Deva" diff --git a/lm_eval/tasks/belebele/belebele_npi_Latn.yaml b/lm_eval/tasks/belebele/belebele_npi_Latn.yaml index 439730b5..60e08809 100644 --- a/lm_eval/tasks/belebele/belebele_npi_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_npi_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "npi_Latn" -"description": "A split of Belebele for the npi_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_npi_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nso_Latn.yaml b/lm_eval/tasks/belebele/belebele_nso_Latn.yaml index 24821280..7029428b 100644 --- a/lm_eval/tasks/belebele/belebele_nso_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_nso_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "nso_Latn" -"description": "A split of Belebele for the nso_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_nso_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nya_Latn.yaml b/lm_eval/tasks/belebele/belebele_nya_Latn.yaml index 987fa736..b648d75e 100644 --- a/lm_eval/tasks/belebele/belebele_nya_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_nya_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "nya_Latn" -"description": "A split of Belebele for the nya_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_nya_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ory_Orya.yaml b/lm_eval/tasks/belebele/belebele_ory_Orya.yaml index 0cbb9bf8..3a55ff01 100644 --- a/lm_eval/tasks/belebele/belebele_ory_Orya.yaml +++ b/lm_eval/tasks/belebele/belebele_ory_Orya.yaml @@ -1,4 +1,3 @@ "dataset_name": "ory_Orya" -"description": "A split of Belebele for the ory_Orya language.\n\n" "include": "_default_template_yaml" "task": "belebele_ory_Orya" diff --git a/lm_eval/tasks/belebele/belebele_pan_Guru.yaml b/lm_eval/tasks/belebele/belebele_pan_Guru.yaml index c266060c..b61bc026 100644 --- a/lm_eval/tasks/belebele/belebele_pan_Guru.yaml +++ b/lm_eval/tasks/belebele/belebele_pan_Guru.yaml @@ -1,4 +1,3 @@ "dataset_name": "pan_Guru" -"description": "A split of Belebele for the pan_Guru language.\n\n" "include": "_default_template_yaml" "task": "belebele_pan_Guru" diff --git a/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml b/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml index f018adca..4a0631e7 100644 --- a/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "pbt_Arab" -"description": "A split of Belebele for the pbt_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_pbt_Arab" diff --git a/lm_eval/tasks/belebele/belebele_pes_Arab.yaml b/lm_eval/tasks/belebele/belebele_pes_Arab.yaml index be1a4678..70b5608c 100644 --- a/lm_eval/tasks/belebele/belebele_pes_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_pes_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "pes_Arab" -"description": "A split of Belebele for the pes_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_pes_Arab" diff --git a/lm_eval/tasks/belebele/belebele_plt_Latn.yaml b/lm_eval/tasks/belebele/belebele_plt_Latn.yaml index ef0ea9c5..ecd13144 100644 --- a/lm_eval/tasks/belebele/belebele_plt_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_plt_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "plt_Latn" -"description": "A split of Belebele for the plt_Latn language.\n\n" "include": "_default_template_yaml" 
"task": "belebele_plt_Latn" diff --git a/lm_eval/tasks/belebele/belebele_pol_Latn.yaml b/lm_eval/tasks/belebele/belebele_pol_Latn.yaml index 5091fa9a..4c3aedae 100644 --- a/lm_eval/tasks/belebele/belebele_pol_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_pol_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "pol_Latn" -"description": "A split of Belebele for the pol_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_pol_Latn" diff --git a/lm_eval/tasks/belebele/belebele_por_Latn.yaml b/lm_eval/tasks/belebele/belebele_por_Latn.yaml index 4d735f1c..1b4636a8 100644 --- a/lm_eval/tasks/belebele/belebele_por_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_por_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "por_Latn" -"description": "A split of Belebele for the por_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_por_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ron_Latn.yaml b/lm_eval/tasks/belebele/belebele_ron_Latn.yaml index 454b1682..5667ecbc 100644 --- a/lm_eval/tasks/belebele/belebele_ron_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ron_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ron_Latn" -"description": "A split of Belebele for the ron_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ron_Latn" diff --git a/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml index 7e2be793..17d3a351 100644 --- a/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "rus_Cyrl" -"description": "A split of Belebele for the rus_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_rus_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml b/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml index 3ebc839f..d19582f5 100644 --- a/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml +++ b/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml @@ -1,4 +1,3 @@ "dataset_name": "shn_Mymr" -"description": "A split of Belebele for the shn_Mymr language.\n\n" "include": "_default_template_yaml" "task": "belebele_shn_Mymr" diff --git a/lm_eval/tasks/belebele/belebele_sin_Latn.yaml b/lm_eval/tasks/belebele/belebele_sin_Latn.yaml index 05953e39..7b631eac 100644 --- a/lm_eval/tasks/belebele/belebele_sin_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_sin_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "sin_Latn" -"description": "A split of Belebele for the sin_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_sin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml b/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml index ab802a87..b025120b 100644 --- a/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml +++ b/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml @@ -1,4 +1,3 @@ "dataset_name": "sin_Sinh" -"description": "A split of Belebele for the sin_Sinh language.\n\n" "include": "_default_template_yaml" "task": "belebele_sin_Sinh" diff --git a/lm_eval/tasks/belebele/belebele_slk_Latn.yaml b/lm_eval/tasks/belebele/belebele_slk_Latn.yaml index 023139f8..00a1c163 100644 --- a/lm_eval/tasks/belebele/belebele_slk_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_slk_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "slk_Latn" -"description": "A split of Belebele for the slk_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_slk_Latn" diff --git a/lm_eval/tasks/belebele/belebele_slv_Latn.yaml b/lm_eval/tasks/belebele/belebele_slv_Latn.yaml index 5de85c80..3567fb5d 100644 --- 
a/lm_eval/tasks/belebele/belebele_slv_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_slv_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "slv_Latn" -"description": "A split of Belebele for the slv_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_slv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sna_Latn.yaml b/lm_eval/tasks/belebele/belebele_sna_Latn.yaml index fc624123..e9f01e83 100644 --- a/lm_eval/tasks/belebele/belebele_sna_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_sna_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "sna_Latn" -"description": "A split of Belebele for the sna_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_sna_Latn" diff --git a/lm_eval/tasks/belebele/belebele_snd_Arab.yaml b/lm_eval/tasks/belebele/belebele_snd_Arab.yaml index ce41c40e..af16a289 100644 --- a/lm_eval/tasks/belebele/belebele_snd_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_snd_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "snd_Arab" -"description": "A split of Belebele for the snd_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_snd_Arab" diff --git a/lm_eval/tasks/belebele/belebele_som_Latn.yaml b/lm_eval/tasks/belebele/belebele_som_Latn.yaml index 330c2da3..06aa53c6 100644 --- a/lm_eval/tasks/belebele/belebele_som_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_som_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "som_Latn" -"description": "A split of Belebele for the som_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_som_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sot_Latn.yaml b/lm_eval/tasks/belebele/belebele_sot_Latn.yaml index dcc0f9cc..bb05d3cd 100644 --- a/lm_eval/tasks/belebele/belebele_sot_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_sot_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "sot_Latn" -"description": "A split of Belebele for the sot_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_sot_Latn" diff --git a/lm_eval/tasks/belebele/belebele_spa_Latn.yaml b/lm_eval/tasks/belebele/belebele_spa_Latn.yaml index b86137af..f0ba62ea 100644 --- a/lm_eval/tasks/belebele/belebele_spa_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_spa_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "spa_Latn" -"description": "A split of Belebele for the spa_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_spa_Latn" diff --git a/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml index 2f4307a3..42a20f6d 100644 --- a/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "srp_Cyrl" -"description": "A split of Belebele for the srp_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_srp_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml b/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml index f83780bd..a3655d7a 100644 --- a/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "ssw_Latn" -"description": "A split of Belebele for the ssw_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_ssw_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sun_Latn.yaml b/lm_eval/tasks/belebele/belebele_sun_Latn.yaml index fe41aead..710b87b9 100644 --- a/lm_eval/tasks/belebele/belebele_sun_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_sun_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "sun_Latn" -"description": "A split of Belebele for the sun_Latn 
language.\n\n" "include": "_default_template_yaml" "task": "belebele_sun_Latn" diff --git a/lm_eval/tasks/belebele/belebele_swe_Latn.yaml b/lm_eval/tasks/belebele/belebele_swe_Latn.yaml index 97c0759f..df1e896a 100644 --- a/lm_eval/tasks/belebele/belebele_swe_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_swe_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "swe_Latn" -"description": "A split of Belebele for the swe_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_swe_Latn" diff --git a/lm_eval/tasks/belebele/belebele_swh_Latn.yaml b/lm_eval/tasks/belebele/belebele_swh_Latn.yaml index 5ab3464a..0a006b91 100644 --- a/lm_eval/tasks/belebele/belebele_swh_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_swh_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "swh_Latn" -"description": "A split of Belebele for the swh_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_swh_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tam_Taml.yaml b/lm_eval/tasks/belebele/belebele_tam_Taml.yaml index 05a84212..0965cbd5 100644 --- a/lm_eval/tasks/belebele/belebele_tam_Taml.yaml +++ b/lm_eval/tasks/belebele/belebele_tam_Taml.yaml @@ -1,4 +1,3 @@ "dataset_name": "tam_Taml" -"description": "A split of Belebele for the tam_Taml language.\n\n" "include": "_default_template_yaml" "task": "belebele_tam_Taml" diff --git a/lm_eval/tasks/belebele/belebele_tel_Telu.yaml b/lm_eval/tasks/belebele/belebele_tel_Telu.yaml index 30c5f89e..4ae5fad4 100644 --- a/lm_eval/tasks/belebele/belebele_tel_Telu.yaml +++ b/lm_eval/tasks/belebele/belebele_tel_Telu.yaml @@ -1,4 +1,3 @@ "dataset_name": "tel_Telu" -"description": "A split of Belebele for the tel_Telu language.\n\n" "include": "_default_template_yaml" "task": "belebele_tel_Telu" diff --git a/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml index 1a68bb92..fd6bf5b6 100644 --- a/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "tgk_Cyrl" -"description": "A split of Belebele for the tgk_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_tgk_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml b/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml index 42a2d8d0..6410484a 100644 --- a/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "tgl_Latn" -"description": "A split of Belebele for the tgl_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_tgl_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tha_Thai.yaml b/lm_eval/tasks/belebele/belebele_tha_Thai.yaml index 254c046e..c3786ccf 100644 --- a/lm_eval/tasks/belebele/belebele_tha_Thai.yaml +++ b/lm_eval/tasks/belebele/belebele_tha_Thai.yaml @@ -1,4 +1,3 @@ "dataset_name": "tha_Thai" -"description": "A split of Belebele for the tha_Thai language.\n\n" "include": "_default_template_yaml" "task": "belebele_tha_Thai" diff --git a/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml b/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml index c456c566..982ebb05 100644 --- a/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml +++ b/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml @@ -1,4 +1,3 @@ "dataset_name": "tir_Ethi" -"description": "A split of Belebele for the tir_Ethi language.\n\n" "include": "_default_template_yaml" "task": "belebele_tir_Ethi" diff --git a/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml b/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml index 
77ef814e..026c20f5 100644 --- a/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "tsn_Latn" -"description": "A split of Belebele for the tsn_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_tsn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tso_Latn.yaml b/lm_eval/tasks/belebele/belebele_tso_Latn.yaml index cbfebb32..91c75d97 100644 --- a/lm_eval/tasks/belebele/belebele_tso_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_tso_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "tso_Latn" -"description": "A split of Belebele for the tso_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_tso_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tur_Latn.yaml b/lm_eval/tasks/belebele/belebele_tur_Latn.yaml index 61a55691..e8f5946b 100644 --- a/lm_eval/tasks/belebele/belebele_tur_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_tur_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "tur_Latn" -"description": "A split of Belebele for the tur_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_tur_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml index bf496a6d..1f247407 100644 --- a/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml +++ b/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml @@ -1,4 +1,3 @@ "dataset_name": "ukr_Cyrl" -"description": "A split of Belebele for the ukr_Cyrl language.\n\n" "include": "_default_template_yaml" "task": "belebele_ukr_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_urd_Arab.yaml b/lm_eval/tasks/belebele/belebele_urd_Arab.yaml index b210f8b6..58a2016d 100644 --- a/lm_eval/tasks/belebele/belebele_urd_Arab.yaml +++ b/lm_eval/tasks/belebele/belebele_urd_Arab.yaml @@ -1,4 +1,3 @@ "dataset_name": "urd_Arab" -"description": "A split of Belebele for the urd_Arab language.\n\n" "include": "_default_template_yaml" "task": "belebele_urd_Arab" diff --git a/lm_eval/tasks/belebele/belebele_urd_Latn.yaml b/lm_eval/tasks/belebele/belebele_urd_Latn.yaml index d48e79e6..a618465b 100644 --- a/lm_eval/tasks/belebele/belebele_urd_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_urd_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "urd_Latn" -"description": "A split of Belebele for the urd_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_urd_Latn" diff --git a/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml b/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml index e45b09e9..4c8c0567 100644 --- a/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "uzn_Latn" -"description": "A split of Belebele for the uzn_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_uzn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_vie_Latn.yaml b/lm_eval/tasks/belebele/belebele_vie_Latn.yaml index a420a4c9..4c676ad9 100644 --- a/lm_eval/tasks/belebele/belebele_vie_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_vie_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "vie_Latn" -"description": "A split of Belebele for the vie_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_vie_Latn" diff --git a/lm_eval/tasks/belebele/belebele_war_Latn.yaml b/lm_eval/tasks/belebele/belebele_war_Latn.yaml index 45dfd5fa..4b133e24 100644 --- a/lm_eval/tasks/belebele/belebele_war_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_war_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "war_Latn" -"description": "A split of 
Belebele for the war_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_war_Latn" diff --git a/lm_eval/tasks/belebele/belebele_wol_Latn.yaml b/lm_eval/tasks/belebele/belebele_wol_Latn.yaml index 1cac6be5..67b0530c 100644 --- a/lm_eval/tasks/belebele/belebele_wol_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_wol_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "wol_Latn" -"description": "A split of Belebele for the wol_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_wol_Latn" diff --git a/lm_eval/tasks/belebele/belebele_xho_Latn.yaml b/lm_eval/tasks/belebele/belebele_xho_Latn.yaml index a1e36894..7665f9b5 100644 --- a/lm_eval/tasks/belebele/belebele_xho_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_xho_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "xho_Latn" -"description": "A split of Belebele for the xho_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_xho_Latn" diff --git a/lm_eval/tasks/belebele/belebele_yor_Latn.yaml b/lm_eval/tasks/belebele/belebele_yor_Latn.yaml index 2de6aecb..e293145f 100644 --- a/lm_eval/tasks/belebele/belebele_yor_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_yor_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "yor_Latn" -"description": "A split of Belebele for the yor_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_yor_Latn" diff --git a/lm_eval/tasks/belebele/belebele_zho_Hans.yaml b/lm_eval/tasks/belebele/belebele_zho_Hans.yaml index cdafe7f8..0d5d175a 100644 --- a/lm_eval/tasks/belebele/belebele_zho_Hans.yaml +++ b/lm_eval/tasks/belebele/belebele_zho_Hans.yaml @@ -1,4 +1,3 @@ "dataset_name": "zho_Hans" -"description": "A split of Belebele for the zho_Hans language.\n\n" "include": "_default_template_yaml" "task": "belebele_zho_Hans" diff --git a/lm_eval/tasks/belebele/belebele_zho_Hant.yaml b/lm_eval/tasks/belebele/belebele_zho_Hant.yaml index 531af9e1..54fedc5d 100644 --- a/lm_eval/tasks/belebele/belebele_zho_Hant.yaml +++ b/lm_eval/tasks/belebele/belebele_zho_Hant.yaml @@ -1,4 +1,3 @@ "dataset_name": "zho_Hant" -"description": "A split of Belebele for the zho_Hant language.\n\n" "include": "_default_template_yaml" "task": "belebele_zho_Hant" diff --git a/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml b/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml index b005512f..616bcc0d 100644 --- a/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "zsm_Latn" -"description": "A split of Belebele for the zsm_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_zsm_Latn" diff --git a/lm_eval/tasks/belebele/belebele_zul_Latn.yaml b/lm_eval/tasks/belebele/belebele_zul_Latn.yaml index 04cc5b15..e9da6f4d 100644 --- a/lm_eval/tasks/belebele/belebele_zul_Latn.yaml +++ b/lm_eval/tasks/belebele/belebele_zul_Latn.yaml @@ -1,4 +1,3 @@ "dataset_name": "zul_Latn" -"description": "A split of Belebele for the zul_Latn language.\n\n" "include": "_default_template_yaml" "task": "belebele_zul_Latn" -- GitLab From 47a7d41c04c359a160d9c0a50f57e1d446e978db Mon Sep 17 00:00:00 2001 From: ManuelFay Date: Mon, 25 Sep 2023 23:37:12 +0200 Subject: [PATCH 055/212] match belebele paper prompts more closely --- lm_eval/tasks/belebele/README.md | 4 +++- lm_eval/tasks/belebele/_default_template_yaml | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lm_eval/tasks/belebele/README.md b/lm_eval/tasks/belebele/README.md index 8855c7c6..7b6ab809 100644 --- a/lm_eval/tasks/belebele/README.md +++
b/lm_eval/tasks/belebele/README.md @@ -32,7 +32,9 @@ Homepage: https://github.com/facebookresearch/belebele The following tasks evaluate languages in the Belebele dataset using loglikelihood-based multiple-choice scoring: -- `cmmlu_{language}` +- `belebele_{language}` + +The variant evaluated here is the 0-shot or few-shot evaluation with English Instructions. ### Checklist diff --git a/lm_eval/tasks/belebele/_default_template_yaml b/lm_eval/tasks/belebele/_default_template_yaml index be3cf53b..4dbea664 100644 --- a/lm_eval/tasks/belebele/_default_template_yaml +++ b/lm_eval/tasks/belebele/_default_template_yaml @@ -1,5 +1,6 @@ group: belebele dataset_path: facebook/belebele +description: "Choose the best answer to the question.\n" test_split: test fewshot_split: test fewshot_config: @@ -7,7 +8,7 @@ fewshot_config: output_type: multiple_choice should_decontaminate: true doc_to_decontamination_query: "{{question}}" -doc_to_text: "{{question.strip()}}\nA. {{mc_answer1}}\nB. {{mc_answer2}}\nC. {{mc_answer3}}\nD. {{mc_answer4}}\nAnswer:" +doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA. {{mc_answer1}}\nB. {{mc_answer2}}\nC. {{mc_answer3}}\nD. {{mc_answer4}}\nAnswer:" doc_to_choice: ["A", "B", "C", "D"] doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" metric_list: -- GitLab From 3f090027112a6e9ea45d7ee2deb57edac5f03ea2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 26 Sep 2023 03:44:17 +0000 Subject: [PATCH 056/212] moved files --- lm_eval/prompts/__init__.py | 7 +++++-- lm_eval/tasks/__init__.py | 11 +++++++++-- lm_eval/tasks/benchmarks/{ => flan}/flan_anli.yaml | 6 +++--- lm_eval/tasks/benchmarks/flan/flan_arc.yaml | 14 ++++++++++++++ .../tasks/benchmarks/{ => flan}/flan_boolq.yaml | 2 +- lm_eval/tasks/benchmarks/{ => flan}/flan_cot.yaml | 0 lm_eval/tasks/benchmarks/flan/flan_held_in.yaml | 6 ++++++ .../{flan_held_in.yaml => flan/flan_held_in_yaml} | 14 +++++++------- .../tasks/benchmarks/{ => flan}/flan_held_out.yaml | 0 lm_eval/tasks/benchmarks/{ => flan}/flan_rte.yaml | 0 .../prompt_templates/{flan_anli.yaml => anli.yaml} | 0 .../prompt_templates/{flan_arc.yaml => arc.yaml} | 0 .../{flan_boolq.yaml => boolq.yaml} | 0 .../prompt_templates/{flan_rte.yaml => rte.yaml} | 0 lm_eval/utils.py | 4 +++- 15 files changed, 48 insertions(+), 16 deletions(-) rename lm_eval/tasks/benchmarks/{ => flan}/flan_anli.yaml (70%) create mode 100644 lm_eval/tasks/benchmarks/flan/flan_arc.yaml rename lm_eval/tasks/benchmarks/{ => flan}/flan_boolq.yaml (74%) rename lm_eval/tasks/benchmarks/{ => flan}/flan_cot.yaml (100%) create mode 100644 lm_eval/tasks/benchmarks/flan/flan_held_in.yaml rename lm_eval/tasks/benchmarks/{flan_held_in.yaml => flan/flan_held_in_yaml} (72%) rename lm_eval/tasks/benchmarks/{ => flan}/flan_held_out.yaml (100%) rename lm_eval/tasks/benchmarks/{ => flan}/flan_rte.yaml (100%) rename lm_eval/tasks/benchmarks/flan/prompt_templates/{flan_anli.yaml => anli.yaml} (100%) rename lm_eval/tasks/benchmarks/flan/prompt_templates/{flan_arc.yaml => arc.yaml} (100%) rename lm_eval/tasks/benchmarks/flan/prompt_templates/{flan_boolq.yaml => boolq.yaml} (100%) rename lm_eval/tasks/benchmarks/flan/prompt_templates/{flan_rte.yaml => rte.yaml} (100%) diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index 9662cd8e..545f3331 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -65,7 +65,7 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None ) -def load_prompt_list(use_prompt: str, 
dataset_name=None, subset_name=None, **kwargs): +def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, file_dir=None, **kwargs): category_name, prompt_name = use_prompt.split(":") @@ -84,6 +84,9 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwa elif ".yaml" in category_name: import yaml + if file_dir is not None: + category_name = os.path.realpath(os.path.join(file_dir, category_name)) + with open(category_name, "rb") as file: prompt_yaml_file = yaml.full_load(file) @@ -98,7 +101,7 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwa # for prompt in prompt_name: # prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names)) # else: - prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names) + # prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names) return [":".join([category_name, prompt]) for prompt in prompt_list] diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index b935d106..3de56fff 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -38,7 +38,7 @@ def register_configurable_task(config: Dict[str, str]) -> int: return 0 -def register_configurable_group(config: Dict[str, str]) -> int: +def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int: group = config["group"] all_task_list = config["task"] config_list = [task for task in all_task_list if type(task) != str] @@ -57,6 +57,7 @@ def register_configurable_group(config: Dict[str, str]) -> int: # **_task["CONFIG"], # **task_config # } + task_config = utils.load_yaml_config(yaml_path, task_config) var_configs = check_prompt_config( { **task_config, @@ -128,6 +129,10 @@ def include_task_folder(task_dir: str, register_task=True) -> None: try: config = utils.load_yaml_config(yaml_path) + # if ("prompts" in config) and (len(config.keys()) == 1): + + # continue + if register_task: all_configs = check_prompt_config(config) for config in all_configs: @@ -136,9 +141,11 @@ def include_task_folder(task_dir: str, register_task=True) -> None: # If a `task` in config is a list, # that means it's a benchmark if type(config["task"]) == list: - register_configurable_group(config) + register_configurable_group(config, yaml_path) except Exception as error: + import traceback + print(traceback.format_exc()) eval_logger.warning( "Failed to load config in\n" f" {yaml_path}\n" diff --git a/lm_eval/tasks/benchmarks/flan_anli.yaml b/lm_eval/tasks/benchmarks/flan/flan_anli.yaml similarity index 70% rename from lm_eval/tasks/benchmarks/flan_anli.yaml rename to lm_eval/tasks/benchmarks/flan/flan_anli.yaml index d6201bb7..ae245c85 100644 --- a/lm_eval/tasks/benchmarks/flan_anli.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_anli.yaml @@ -3,15 +3,15 @@ task: - include: flan/yaml_templates/held_in_template_yaml task: anli_r1 dataset_path: anli - use_prompt: flan/prompt_templates/flan_anli.yaml:* + use_prompt: flan/prompt_templates/anli.yaml:* validation_split: dev_r1 - include: flan/yaml_templates/held_in_template_yaml task: anli_r2 dataset_path: anli - use_prompt: flan/prompt_templates/flan_anli.yaml:* + use_prompt: flan/prompt_templates/anli.yaml:* validation_split: dev_r2 - include: flan/yaml_templates/held_in_template_yaml task: anli_r3 dataset_path: anli - use_prompt: flan/prompt_templates/flan_anli.yaml:* + use_prompt: flan/prompt_templates/anli.yaml:* validation_split: dev_r3 diff --git a/lm_eval/tasks/benchmarks/flan/flan_arc.yaml 
b/lm_eval/tasks/benchmarks/flan/flan_arc.yaml new file mode 100644 index 00000000..0e1efe90 --- /dev/null +++ b/lm_eval/tasks/benchmarks/flan/flan_arc.yaml @@ -0,0 +1,14 @@ +group: flan_arc +task: + - include: flan/yaml_templates/held_in_template_yaml + task: arc_easy + dataset_path: ai2_arc + dataset_name: ARC-Easy + use_prompt: flan/prompt_templates/arc.yaml:* + validation_split: validation + - include: flan/yaml_templates/held_in_template_yaml + task: arc_challenge + dataset_path: ai2_arc + dataset_name: ARC-Challenge + use_prompt: flan/prompt_templates/arc.yaml:* + validation_split: validation diff --git a/lm_eval/tasks/benchmarks/flan_boolq.yaml b/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml similarity index 74% rename from lm_eval/tasks/benchmarks/flan_boolq.yaml rename to lm_eval/tasks/benchmarks/flan/flan_boolq.yaml index f7ca4796..7ba060e7 100644 --- a/lm_eval/tasks/benchmarks/flan_boolq.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml @@ -3,5 +3,5 @@ task: - include: flan/yaml_templates/held_in_template_yaml dataset_path: super_glue dataset_name: boolq - use_prompt: flan/prompt_templates/flan_boolq.yaml:* + use_prompt: flan/prompt_templates/boolq.yaml:* validation_split: validation diff --git a/lm_eval/tasks/benchmarks/flan_cot.yaml b/lm_eval/tasks/benchmarks/flan/flan_cot.yaml similarity index 100% rename from lm_eval/tasks/benchmarks/flan_cot.yaml rename to lm_eval/tasks/benchmarks/flan/flan_cot.yaml diff --git a/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml b/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml new file mode 100644 index 00000000..5465b58c --- /dev/null +++ b/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml @@ -0,0 +1,6 @@ +group: flan_held_in +task: + - flan_boolq + - flan_rte + - flan_anli + - flan_arc diff --git a/lm_eval/tasks/benchmarks/flan_held_in.yaml b/lm_eval/tasks/benchmarks/flan/flan_held_in_yaml similarity index 72% rename from lm_eval/tasks/benchmarks/flan_held_in.yaml rename to lm_eval/tasks/benchmarks/flan/flan_held_in_yaml index f1965c9d..a31a942e 100644 --- a/lm_eval/tasks/benchmarks/flan_held_in.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_held_in_yaml @@ -3,37 +3,37 @@ task: - include: flan/yaml_templates/held_in_template_yaml dataset_path: super_glue dataset_name: boolq - use_prompt: flan/prompt_templates/flan_boolq.yaml:* + use_prompt: flan/prompt_templates/boolq.yaml:* validation_split: validation - include: flan/yaml_templates/held_in_template_yaml dataset_path: super_glue dataset_name: rte - use_prompt: flan/prompt_templates/flan_rte.yaml:* + use_prompt: flan/prompt_templates/rte.yaml:* validation_split: validation - include: flan/yaml_templates/held_in_template_yaml task: anli_r1 dataset_path: anli - use_prompt: flan/prompt_templates/flan_anli.yaml:* + use_prompt: flan/prompt_templates/anli.yaml:* validation_split: dev_r1 - include: flan/yaml_templates/held_in_template_yaml task: anli_r2 dataset_path: anli - use_prompt: flan/prompt_templates/flan_anli.yaml:* + use_prompt: flan/prompt_templates/anli.yaml:* validation_split: dev_r2 - include: flan/yaml_templates/held_in_template_yaml task: anli_r3 dataset_path: anli - use_prompt: flan/prompt_templates/flan_anli.yaml:* + use_prompt: flan/prompt_templates/anli.yaml:* validation_split: dev_r3 - include: flan/yaml_templates/held_in_template_yaml task: arc_easy dataset_path: ai2_arc dataset_name: ARC-Easy - use_prompt: flan/prompt_templates/flan_arc.yaml:* + use_prompt: flan/prompt_templates/arc.yaml:* validation_split: validation - include: flan/yaml_templates/held_in_template_yaml 
task: arc_challenge dataset_path: ai2_arc dataset_name: ARC-Challenge - use_prompt: flan/prompt_templates/flan_arc.yaml:* + use_prompt: flan/prompt_templates/arc.yaml:* validation_split: validation diff --git a/lm_eval/tasks/benchmarks/flan_held_out.yaml b/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml similarity index 100% rename from lm_eval/tasks/benchmarks/flan_held_out.yaml rename to lm_eval/tasks/benchmarks/flan/flan_held_out.yaml diff --git a/lm_eval/tasks/benchmarks/flan_rte.yaml b/lm_eval/tasks/benchmarks/flan/flan_rte.yaml similarity index 100% rename from lm_eval/tasks/benchmarks/flan_rte.yaml rename to lm_eval/tasks/benchmarks/flan/flan_rte.yaml diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_anli.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/anli.yaml similarity index 100% rename from lm_eval/tasks/benchmarks/flan/prompt_templates/flan_anli.yaml rename to lm_eval/tasks/benchmarks/flan/prompt_templates/anli.yaml diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_arc.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/arc.yaml similarity index 100% rename from lm_eval/tasks/benchmarks/flan/prompt_templates/flan_arc.yaml rename to lm_eval/tasks/benchmarks/flan/prompt_templates/arc.yaml diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_boolq.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/boolq.yaml similarity index 100% rename from lm_eval/tasks/benchmarks/flan/prompt_templates/flan_boolq.yaml rename to lm_eval/tasks/benchmarks/flan/prompt_templates/boolq.yaml diff --git a/lm_eval/tasks/benchmarks/flan/prompt_templates/flan_rte.yaml b/lm_eval/tasks/benchmarks/flan/prompt_templates/rte.yaml similarity index 100% rename from lm_eval/tasks/benchmarks/flan/prompt_templates/flan_rte.yaml rename to lm_eval/tasks/benchmarks/flan/prompt_templates/rte.yaml diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 7014a81e..150aa55c 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -426,7 +426,9 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): if yaml_config is None: with open(yaml_path, "rb") as file: yaml_config = yaml.full_load(file) - yaml_dir = os.path.dirname(yaml_path) + + if yaml_dir is None: + yaml_dir = os.path.dirname(yaml_path) assert yaml_dir is not None -- GitLab From 307118731f93443f14f490d058db7945492599b3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 26 Sep 2023 15:02:22 +0000 Subject: [PATCH 057/212] modify to conform to pre-commit --- lm_eval/tasks/code_x_glue/code-text/bleu.py | 39 +++++++++++++-------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/lm_eval/tasks/code_x_glue/code-text/bleu.py b/lm_eval/tasks/code_x_glue/code-text/bleu.py index aff16afe..310c626c 100644 --- a/lm_eval/tasks/code_x_glue/code-text/bleu.py +++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py @@ -1,4 +1,12 @@ #!/usr/bin/python +import os +import re +import sys +import math +import subprocess +import xml.sax.saxutils + +from typing import List, Pattern, Tuple, Union, Dict, Any, Optional """ This script was adapted from the original version by hieuhoang1972 which is part of MOSES. @@ -17,17 +25,13 @@ score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU sco The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible. 
""" -import sys, math, re, xml.sax.saxutils -import subprocess -import os - # Added to bypass NIST-style pre-processing of hyp and ref files -- wade nonorm = 0 preserve_case = False eff_ref_len = "shortest" -normalize1 = [ +normalize1: List[Tuple[Union[Pattern[str], str], str]] = [ ("", ""), # strip "skipped" tags (r"-\n", ""), # strip end-of-line hyphenation and join lines (r"\n", " "), # join lines @@ -35,7 +39,7 @@ normalize1 = [ ] normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1] -normalize2 = [ +normalize2: List[Tuple[Union[Pattern[str], str], str]] = [ ( r"([\{-\~\[-\` -\&\(-\+\:-\@\/])", r" \1 ", @@ -74,7 +78,7 @@ def normalize(s): def count_ngrams(words, n=4): - counts = {} + counts: Dict[Any, int] = {} for k in range(1, n + 1): for i in range(len(words) - k + 1): ngram = tuple(words[i : i + k]) @@ -88,7 +92,7 @@ def cook_refs(refs, n=4): needs to know about them.""" refs = [normalize(ref) for ref in refs] - maxcounts = {} + maxcounts: Dict[Tuple[str], int] = {} for ref in refs: counts = count_ngrams(ref, n) for (ngram, count) in counts.items(): @@ -101,7 +105,7 @@ def cook_test(test, item, n=4): encapsulates everything that BLEU needs to know about it.""" (reflens, refmaxcounts) = item test = normalize(test) - result = {} + result: Dict[str, Any] = {} result["testlen"] = len(test) # Calculate effective reference sentence length. @@ -111,7 +115,7 @@ def cook_test(test, item, n=4): elif eff_ref_len == "average": result["reflen"] = float(sum(reflens)) / len(reflens) elif eff_ref_len == "closest": - min_diff = None + min_diff: Optional[int] = None for reflen in reflens: if min_diff is None or abs(reflen - len(test)) < min_diff: min_diff = abs(reflen - len(test)) @@ -128,7 +132,12 @@ def cook_test(test, item, n=4): def score_cooked(allcomps, n=4, ground=0, smooth=1): - totalcomps = {"testlen": 0, "reflen": 0, "guess": [0] * n, "correct": [0] * n} + totalcomps: Dict[str, Any] = { + "testlen": 0, + "reflen": 0, + "guess": [0] * n, + "correct": [0] * n, + } for comps in allcomps: for key in ["testlen", "reflen"]: totalcomps[key] += comps[key] @@ -136,7 +145,7 @@ def score_cooked(allcomps, n=4, ground=0, smooth=1): for k in range(n): totalcomps[key][k] += comps[key][k] logbleu = 0.0 - all_bleus = [] + all_bleus: List[float] = [] for k in range(n): correct = totalcomps["correct"][k] guess = totalcomps["guess"][k] @@ -147,7 +156,7 @@ def score_cooked(allcomps, n=4, ground=0, smooth=1): guess + addsmooth + sys.float_info.min ) if guess == 0: - all_bleus.append(-10000000) + all_bleus.append(-10000000.0) else: all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess)) @@ -175,8 +184,8 @@ def splitPuncts(line): def computeMaps(predictions, goldfile): - predictionMap = {} - goldMap = {} + predictionMap: Dict[str, list] = {} + goldMap: Dict[str, list] = {} gf = open(goldfile, "r") for row in predictions: -- GitLab From b2d16321724b572967bc7f1cbe64ae0de4eaa0dd Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 26 Sep 2023 15:06:35 +0000 Subject: [PATCH 058/212] update loading prompts --- lm_eval/prompts/__init__.py | 11 +-- lm_eval/tasks/__init__.py | 71 +++++++++++-------- lm_eval/tasks/bbh/_generate_configs.py | 2 +- lm_eval/tasks/benchmarks/flan/flan_anli.yaml | 12 ++-- lm_eval/tasks/benchmarks/flan/flan_arc.yaml | 8 +-- lm_eval/tasks/benchmarks/flan/flan_boolq.yaml | 4 +- lm_eval/tasks/benchmarks/flan/flan_cot.yaml | 4 +- lm_eval/tasks/benchmarks/flan/flan_rte.yaml | 4 +- .../paws-x/{utils.py => _generate_config.py} | 0 lm_eval/utils.py | 
2 +- 10 files changed, 66 insertions(+), 52 deletions(-) rename lm_eval/tasks/paws-x/{utils.py => _generate_config.py} (100%) diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index 545f3331..68eeac6c 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -1,3 +1,4 @@ +import os import ast from typing import Dict @@ -65,7 +66,9 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None ) -def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, file_dir=None, **kwargs): +def load_prompt_list( + use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs +): category_name, prompt_name = use_prompt.split(":") @@ -84,8 +87,8 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, file_ elif ".yaml" in category_name: import yaml - if file_dir is not None: - category_name = os.path.realpath(os.path.join(file_dir, category_name)) + if yaml_path is not None: + category_name = os.path.realpath(os.path.join(yaml_path, category_name)) with open(category_name, "rb") as file: prompt_yaml_file = yaml.full_load(file) @@ -94,7 +97,7 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, file_ prompt_name, prompt_yaml_file["prompts"].keys() ) - category_name, *prompt_name = use_prompt.split(":") + # category_name, *prompt_name = use_prompt.split(":") # TODO allow to multiple prompt naming # if len(prompt_name) > 1: # prompt_list = [] diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 3de56fff..f0923f09 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -45,24 +45,25 @@ def register_configurable_group(config: Dict[str, str], yaml_path: str = None) - task_list = [task for task in all_task_list if type(task) == str] for task_config in config_list: - # if "task" in task_config: - # task = task_config["task"] - # if task in GROUP_REGISTRY: - # task_list = GROUP_REGISTRY[task] - # elif task in TASK_REGISTRY: - # task_list = [TASK_REGISTRY[task]] - - # for _task in task_list: - # task_config = { - # **_task["CONFIG"], - # **task_config - # } + # assert "task" in task_config: + # task = task_config["task"] + # if task in GROUP_REGISTRY: + # task_list = GROUP_REGISTRY[task] + # elif task in TASK_REGISTRY: + # task_list = [TASK_REGISTRY[task]] + + # for _task in task_list: + # task_config = { + # **_task["CONFIG"], + # **task_config + # } task_config = utils.load_yaml_config(yaml_path, task_config) var_configs = check_prompt_config( { **task_config, **{"group": group}, - } + }, + yaml_path=os.path.dirname(yaml_path), ) for config in var_configs: register_configurable_task(config) @@ -79,13 +80,16 @@ def register_configurable_group(config: Dict[str, str], yaml_path: str = None) - return 0 -def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]: +def check_prompt_config( + config: Dict[str, str], yaml_path: str = None +) -> List[Dict[str, str]]: all_configs = [] if "use_prompt" in config: prompt_list = prompts.load_prompt_list( use_prompt=config["use_prompt"], dataset_name=config["dataset_path"], subset_name=config["dataset_name"] if "dataset_name" in config else None, + yaml_path=yaml_path, ) for idx, prompt_variation in enumerate(prompt_list): all_configs.append( @@ -98,7 +102,9 @@ def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]: config["task"] if "task" in config else get_task_name_from_config(config), - prompt_variation, + prompt_variation.split("/")[-1] + if ".yaml" in 
prompt_variation + else prompt_variation, ] ) }, @@ -117,7 +123,7 @@ def get_task_name_from_config(task_config: Dict[str, str]) -> str: return "{dataset_path}".format(**task_config) -def include_task_folder(task_dir: str, register_task=True) -> None: +def include_task_folder(task_dir: str, register_task: bool = True) -> None: """ Calling this function """ @@ -129,29 +135,33 @@ def include_task_folder(task_dir: str, register_task=True) -> None: try: config = utils.load_yaml_config(yaml_path) - # if ("prompts" in config) and (len(config.keys()) == 1): - - # continue + if "task" not in config: + continue - if register_task: - all_configs = check_prompt_config(config) - for config in all_configs: - register_configurable_task(config) - else: - # If a `task` in config is a list, - # that means it's a benchmark - if type(config["task"]) == list: - register_configurable_group(config, yaml_path) + all_configs = check_prompt_config( + config, yaml_path=os.path.dirname(yaml_path) + ) + for config in all_configs: + if register_task: + if type(config["task"]) == str: + register_configurable_task(config) + else: + if type(config["task"]) == list: + register_configurable_group(config, yaml_path) except Exception as error: import traceback - print(traceback.format_exc()) + + print("###") + print(yaml_path) eval_logger.warning( "Failed to load config in\n" f" {yaml_path}\n" " Config will not be added to registry\n" - f" Error: {error}" + f" Error: {error}\n" + f" Traceback: {traceback.format_exc()}" ) + return 0 def include_path(task_dir): @@ -160,6 +170,7 @@ def include_path(task_dir): include_task_folder(task_dir, register_task=False) return 0 + task_dir = os.path.dirname(os.path.abspath(__file__)) + "/" include_path(task_dir) diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index 0c882af0..1f528722 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -54,7 +54,7 @@ if __name__ == "__main__": shot = "Q:" + shot try: answer = answer_regex.search(shot)[0] - except: + except Exception: print("task", task) print(shot) example = shot.split("Let's think step by step.")[0] diff --git a/lm_eval/tasks/benchmarks/flan/flan_anli.yaml b/lm_eval/tasks/benchmarks/flan/flan_anli.yaml index ae245c85..21278e1f 100644 --- a/lm_eval/tasks/benchmarks/flan/flan_anli.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_anli.yaml @@ -1,17 +1,17 @@ group: flan_anli task: - - include: flan/yaml_templates/held_in_template_yaml + - include: yaml_templates/held_in_template_yaml task: anli_r1 dataset_path: anli - use_prompt: flan/prompt_templates/anli.yaml:* + use_prompt: prompt_templates/anli.yaml:* validation_split: dev_r1 - - include: flan/yaml_templates/held_in_template_yaml + - include: yaml_templates/held_in_template_yaml task: anli_r2 dataset_path: anli - use_prompt: flan/prompt_templates/anli.yaml:* + use_prompt: prompt_templates/anli.yaml:* validation_split: dev_r2 - - include: flan/yaml_templates/held_in_template_yaml + - include: yaml_templates/held_in_template_yaml task: anli_r3 dataset_path: anli - use_prompt: flan/prompt_templates/anli.yaml:* + use_prompt: prompt_templates/anli.yaml:* validation_split: dev_r3 diff --git a/lm_eval/tasks/benchmarks/flan/flan_arc.yaml b/lm_eval/tasks/benchmarks/flan/flan_arc.yaml index 0e1efe90..4e73b7ce 100644 --- a/lm_eval/tasks/benchmarks/flan/flan_arc.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_arc.yaml @@ -1,14 +1,14 @@ group: flan_arc task: - - include: flan/yaml_templates/held_in_template_yaml + - 
include: yaml_templates/held_in_template_yaml task: arc_easy dataset_path: ai2_arc dataset_name: ARC-Easy - use_prompt: flan/prompt_templates/arc.yaml:* + use_prompt: prompt_templates/arc.yaml:* validation_split: validation - - include: flan/yaml_templates/held_in_template_yaml + - include: yaml_templates/held_in_template_yaml task: arc_challenge dataset_path: ai2_arc dataset_name: ARC-Challenge - use_prompt: flan/prompt_templates/arc.yaml:* + use_prompt: prompt_templates/arc.yaml:* validation_split: validation diff --git a/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml b/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml index 7ba060e7..8fe36cd5 100644 --- a/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml @@ -1,7 +1,7 @@ group: flan_boolq task: - - include: flan/yaml_templates/held_in_template_yaml + - include: yaml_templates/held_in_template_yaml dataset_path: super_glue dataset_name: boolq - use_prompt: flan/prompt_templates/boolq.yaml:* + use_prompt: prompt_templates/boolq.yaml:* validation_split: validation diff --git a/lm_eval/tasks/benchmarks/flan/flan_cot.yaml b/lm_eval/tasks/benchmarks/flan/flan_cot.yaml index ff6edc24..10102d24 100644 --- a/lm_eval/tasks/benchmarks/flan/flan_cot.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_cot.yaml @@ -1,11 +1,11 @@ group: flan_cot task: - - include: flan/yaml_templates/cot_template_yaml + - include: yaml_templates/cot_template_yaml dataset_path: gsmk dataset_name: boolq use_prompt: promptsource:* validation_split: validation - - include: flan/yaml_templates/cot_template_yaml + - include: yaml_templates/cot_template_yaml dataset_path: EleutherAI/asdiv use_prompt: promptsource:* validation_split: validation diff --git a/lm_eval/tasks/benchmarks/flan/flan_rte.yaml b/lm_eval/tasks/benchmarks/flan/flan_rte.yaml index cf5832bf..a4f40736 100644 --- a/lm_eval/tasks/benchmarks/flan/flan_rte.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_rte.yaml @@ -1,7 +1,7 @@ group: flan_rte task: - - include: flan/yaml_templates/held_in_template_yaml + - include: yaml_templates/held_in_template_yaml dataset_path: super_glue dataset_name: rte - use_prompt: flan/prompt_templates/flan_rte.yaml:* + use_prompt: prompt_templates/rte.yaml:* validation_split: validation diff --git a/lm_eval/tasks/paws-x/utils.py b/lm_eval/tasks/paws-x/_generate_config.py similarity index 100% rename from lm_eval/tasks/paws-x/utils.py rename to lm_eval/tasks/paws-x/_generate_config.py diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 150aa55c..356fdf7b 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -426,7 +426,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): if yaml_config is None: with open(yaml_path, "rb") as file: yaml_config = yaml.full_load(file) - + if yaml_dir is None: yaml_dir = os.path.dirname(yaml_path) -- GitLab From 6ae9e9e80c1e0049237d0179ac94fd57bc49029b Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 26 Sep 2023 15:36:48 +0000 Subject: [PATCH 059/212] remove comments --- lm_eval/tasks/__init__.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index f0923f09..64dd4fdb 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -45,18 +45,7 @@ def register_configurable_group(config: Dict[str, str], yaml_path: str = None) - task_list = [task for task in all_task_list if type(task) == str] for task_config in config_list: - # assert "task" in task_config: - # task = task_config["task"] - # 
if task in GROUP_REGISTRY: - # task_list = GROUP_REGISTRY[task] - # elif task in TASK_REGISTRY: - # task_list = [TASK_REGISTRY[task]] - - # for _task in task_list: - # task_config = { - # **_task["CONFIG"], - # **task_config - # } + task_config = utils.load_yaml_config(yaml_path, task_config) var_configs = check_prompt_config( { -- GitLab From 3d2ee4d43e25a069aec30ccbdff17861deed55e1 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 26 Sep 2023 12:10:04 -0400 Subject: [PATCH 060/212] Update _default_template_yaml --- lm_eval/tasks/mmlu/default/_default_template_yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index 1064a2d1..bd989c40 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -2,6 +2,8 @@ group: mmlu dataset_path: cais/mmlu test_split: test fewshot_split: dev +fewshot_config: + sampler: first_n output_type: multiple_choice doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" doc_to_choice: ["A", "B", "C", "D"] -- GitLab From 5c418e2e7d333b2c581dec88391e9d46312fb917 Mon Sep 17 00:00:00 2001 From: Tanishq Abraham <37097934+tmabraham@users.noreply.github.com> Date: Fri, 29 Sep 2023 03:31:20 -0700 Subject: [PATCH 061/212] Update pubmedqa.yaml --- lm_eval/tasks/pubmedqa/pubmedqa.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lm_eval/tasks/pubmedqa/pubmedqa.yaml b/lm_eval/tasks/pubmedqa/pubmedqa.yaml index ae5e0657..a182521e 100644 --- a/lm_eval/tasks/pubmedqa/pubmedqa.yaml +++ b/lm_eval/tasks/pubmedqa/pubmedqa.yaml @@ -1,10 +1,10 @@ task: pubmed_qa -dataset_path: pubmed_qa -dataset_name: pqa_labeled +dataset_path: bigbio/pubmed_qa +dataset_name: pubmed_qa_labeled_fold0_source output_type: multiple_choice -training_split: null -validation_split: null -test_split: train +training_split: train +validation_split: validation +test_split: test doc_to_text: !function preprocess_pubmedqa.doc_to_text doc_to_target: final_decision doc_to_choice: ["yes", "no", "maybe"] -- GitLab From 6658f510321c97f33b0fb08e5ddaf136d12bfff2 Mon Sep 17 00:00:00 2001 From: Tanishq Abraham <37097934+tmabraham@users.noreply.github.com> Date: Fri, 29 Sep 2023 03:31:33 -0700 Subject: [PATCH 062/212] Update preprocess_pubmedqa.py --- lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py index 095e58b2..516f0e2f 100644 --- a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py +++ b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py @@ -1,7 +1,7 @@ def doc_to_text(doc) -> str: - ctxs = "\n".join(doc["context"]["contexts"]) + ctxs = "\n".join(doc["CONTEXTS"]) return "Abstract: {}\nQuestion: {}\nAnswer:".format( - ctxs, doc["question"], doc["final_decision"] + ctxs, doc["QUESTION"], doc["final_decision"] ) -- GitLab From e634f83fe931108d080936ee2b17f878fa3f1ba6 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 2 Oct 2023 16:25:32 +0200 Subject: [PATCH 063/212] Add PolEmo2 tasks --- lm_eval/tasks/polemo2/README.md | 57 ++++++++++++++++++++++++++ lm_eval/tasks/polemo2/polemo2_in.yaml | 43 +++++++++++++++++++ lm_eval/tasks/polemo2/polemo2_out.yaml | 4 ++ 3 files changed, 104 insertions(+) create mode 100644 lm_eval/tasks/polemo2/README.md create 
mode 100644 lm_eval/tasks/polemo2/polemo2_in.yaml create mode 100644 lm_eval/tasks/polemo2/polemo2_out.yaml diff --git a/lm_eval/tasks/polemo2/README.md new file mode 100644 index 00000000..5d48ece2 --- /dev/null +++ b/lm_eval/tasks/polemo2/README.md @@ -0,0 +1,57 @@ +# PolEmo 2.0 + +### Paper + +Title: `Multi-Level Sentiment Analysis of PolEmo 2.0: Extended Corpus of Multi-Domain Consumer Reviews` + +Abstract: https://aclanthology.org/K19-1092/ + +PolEmo 2.0 is a dataset of online consumer reviews in Polish from four domains: medicine, hotels, products, and university. It is human-annotated at the level of both full reviews and individual sentences. It comprises over 8000 reviews, about 85% from the medicine and hotel domains. +The goal is to predict the sentiment of a review. There are two separate test sets, to allow for in-domain (medicine and hotels) as well as out-of-domain (products and university) validation. + +Homepage: https://clarin-pl.eu/dspace/handle/11321/710 + + +### Citation + +``` +@inproceedings{kocon-etal-2019-multi, + title = "Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews", + author = "Koco{\'n}, Jan and + Mi{\l}kowski, Piotr and + Za{\'s}ko-Zieli{\'n}ska, Monika", + booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)", + month = nov, + year = "2019", + address = "Hong Kong, China", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/K19-1092", + doi = "10.18653/v1/K19-1092", + pages = "980--991", + abstract = "In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).", +} +``` + +### Groups and Tasks + +#### Groups + +* `polemo2`: Evaluates `polemo2_in` and `polemo2_out` + +#### Tasks + +* `polemo2_in`: evaluates sentiment predictions of in-domain (medicine and hotels) reviews +* `polemo2_out`: evaluates sentiment predictions of out-of-domain (products and university) reviews + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
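The `polemo2_in` config that follows scores free-form generations by filtering them down to a label: a regex pulls out the first standalone answer letter, which is then mapped onto the same 0-3 index that `doc_to_target` yields. A minimal standalone sketch of what that filter chain computes, assuming only the regex and mapping shown in the YAML below (the `score_first` helper name is ours for illustration, not a harness API):

```python
import re

def score_first(generation: str) -> int:
    # Extract the first standalone A-D letter from the model's generation,
    # mirroring the "regex" then "take_first" filters in the task config.
    match = re.search(r"\b[ABCD]\b", generation)
    if match is None:
        return -1  # mirrors the map filter's default_value of -1
    # Map the letter onto the 0-3 label index used by doc_to_target.
    return {"A": 0, "B": 1, "C": 2, "D": 3}[match.group(0)]

print(score_first("B - Negatywny"))   # -> 1
print(score_first("no letter here"))  # -> -1
```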
diff --git a/lm_eval/tasks/polemo2/polemo2_in.yaml b/lm_eval/tasks/polemo2/polemo2_in.yaml new file mode 100644 index 00000000..4c2250f8 --- /dev/null +++ b/lm_eval/tasks/polemo2/polemo2_in.yaml @@ -0,0 +1,43 @@ +group: + - polemo2 +task: polemo2_in +dataset_path: allegro/klej-polemo2-in +dataset_name: klej-polemo2-in +output_type: greedy_until +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii. Możliwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawidłowa odpowiedź:" +doc_to_target: "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}" +should_decontaminate: true +doc_to_decontamination_query: "{{sentence}}" +generation_kwargs: + until: + - "." + - "," + do_sample: false + temperature: 0.0 + max_gen_toks: 50 +filter_list: + - name: "score-first" + filter: + - function: "regex" + regex_pattern: "(\\b[ABCD]\\b)" + - function: "take_first" + - function: "map" + mapping_dict: + A: 0 + B: 1 + C: 2 + D: 3 + default_value: -1 + - function: "take_first" +metric_list: + - metric: f1 + aggregation: mean + higher_is_better: true + hf_evaluate: true + average: micro + - metric: accuracy + aggregation: mean + higher_is_better: true diff --git a/lm_eval/tasks/polemo2/polemo2_out.yaml b/lm_eval/tasks/polemo2/polemo2_out.yaml new file mode 100644 index 00000000..a815a780 --- /dev/null +++ b/lm_eval/tasks/polemo2/polemo2_out.yaml @@ -0,0 +1,4 @@ +include: polemo2_in.yaml +task: polemo2_out +dataset_path: allegro/klej-polemo2-out +dataset_name: klej-polemo2-out -- GitLab From 06ce7a62dece2427ede1ea34b6aad77b7ac5cbd4 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 2 Oct 2023 16:33:25 +0200 Subject: [PATCH 064/212] Fix formatting --- lm_eval/tasks/polemo2/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/polemo2/README.md b/lm_eval/tasks/polemo2/README.md index 5d48ece2..837c704d 100644 --- a/lm_eval/tasks/polemo2/README.md +++ b/lm_eval/tasks/polemo2/README.md @@ -1,4 +1,4 @@ -# PolEmo 2.0 +# PolEmo 2.0 ### Paper -- GitLab From 3a7f52d033e72eb9bd34ce6aa8ff48090893d9e2 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:31:24 -0400 Subject: [PATCH 065/212] Bring task name to `pubmedqa` in line with master branch --- lm_eval/tasks/pubmedqa/pubmedqa.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/pubmedqa/pubmedqa.yaml b/lm_eval/tasks/pubmedqa/pubmedqa.yaml index a182521e..6903aba4 100644 --- a/lm_eval/tasks/pubmedqa/pubmedqa.yaml +++ b/lm_eval/tasks/pubmedqa/pubmedqa.yaml @@ -1,4 +1,4 @@ -task: pubmed_qa +task: pubmedqa dataset_path: bigbio/pubmed_qa dataset_name: pubmed_qa_labeled_fold0_source output_type: multiple_choice -- GitLab From 1a77b4d54e84a7b9a49db2bbc7325ede487c9d00 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 3 Oct 2023 20:12:53 +0000 Subject: [PATCH 066/212] update generate_tasks.py and template yamls --- lm_eval/tasks/bigbench/generate_tasks.py | 2 +- lm_eval/tasks/bigbench/greedy_until_template_yaml | 5 ++--- lm_eval/tasks/bigbench/multiple_choice_template_yaml | 9 ++++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py index fa68190e..dbd7a959 100644 --- a/lm_eval/tasks/bigbench/generate_tasks.py +++ b/lm_eval/tasks/bigbench/generate_tasks.py @@ 
-185,7 +185,7 @@ def main() -> None: { "include": f"../{task_type}", "task": "bigbench_" + task + "_{}".format(task_type.split("_template_yaml")[0]), - "dataset_name": task, + "dataset_name": task + "_zero_shot", # zero-shot version of the dataset }, f, width=float("inf"), allow_unicode=True diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/greedy_until_template_yaml index 1d4e492b..db975306 100644 --- a/lm_eval/tasks/bigbench/greedy_until_template_yaml +++ b/lm_eval/tasks/bigbench/greedy_until_template_yaml @@ -1,8 +1,7 @@ group: bigbench -dataset_path: bigbench +dataset_path: hails/bigbench output_type: greedy_until -training_split: train -validation_split: validation +test_split: default doc_to_text: inputs doc_to_target: "{{targets[0]}}" generation_kwargs: diff --git a/lm_eval/tasks/bigbench/multiple_choice_template_yaml b/lm_eval/tasks/bigbench/multiple_choice_template_yaml index 3dd2af61..6211f5b3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml +++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml @@ -1,10 +1,13 @@ group: bigbench -dataset_path: bigbench +dataset_path: hails/bigbench +dataset_kwargs: + num_shots: 0 # TODO: num of shots should be controlled through this, not through the typical methods + # subtask_name: null output_type: multiple_choice -training_split: train -validation_split: validation +test_split: default doc_to_text: inputs doc_to_target: "{{multiple_choice_targets.index(targets[0])}}" doc_to_choice: "{{multiple_choice_targets}}" metric_list: - metric: acc + # TODO: brier score and other metrics -- GitLab From 8806944e8790170b1b5429af83ceeac63c961660 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 3 Oct 2023 20:13:21 +0000 Subject: [PATCH 067/212] push updated bigbench task yamls --- .../bigbench/greedy_until/abstract_narrative_understanding.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml | 2 +- .../tasks/bigbench/greedy_until/authorship_verification.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml | 2 +- .../greedy_until/bridging_anaphora_resolution_barqa.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml | 2 +- .../tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/codenames.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/color.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml | 2 +- .../tasks/bigbench/greedy_until/conceptual_combinations.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml | 2 +- .../greedy_until/contextual_parametric_knowledge_conflicts.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml | 2 +- 
lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml | 2 +- .../bigbench/greedy_until/discourse_marker_prediction.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml | 2 +- .../tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml | 2 +- .../tasks/bigbench/greedy_until/english_russian_proverbs.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml | 2 +- .../tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml | 2 +- .../greedy_until/evaluating_information_essentiality.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml | 2 +- .../tasks/bigbench/greedy_until/figure_of_speech_detection.yaml | 2 +- .../greedy_until/formal_fallacies_syllogisms_negation.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/gem.yaml | 2 +- .../greedy_until/gender_inclusive_sentences_german.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml | 2 +- .../tasks/bigbench/greedy_until/gre_reading_comprehension.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml | 2 +- .../tasks/bigbench/greedy_until/hindi_question_answering.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/implicatures.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml | 2 +- .../greedy_until/international_phonetic_alphabet_nli.yaml | 2 +- .../international_phonetic_alphabet_transliterate.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/kannada.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/language_games.yaml | 2 +- .../tasks/bigbench/greedy_until/language_identification.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/list_functions.yaml | 2 +- 
lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/logical_args.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml | 2 +- .../tasks/bigbench/greedy_until/logical_fallacy_detection.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml | 2 +- .../bigbench/greedy_until/movie_dialog_same_or_different.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/multiemo.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/navigate.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/object_counting.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/operators.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml | 2 +- .../bigbench/greedy_until/parsinlu_reading_comprehension.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/physics.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml | 2 +- .../bigbench/greedy_until/play_dialog_same_or_different.yaml | 2 +- .../tasks/bigbench/greedy_until/polish_sequence_labeling.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/question_selection.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml | 2 +- .../bigbench/greedy_until/reasoning_about_colored_objects.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/rephrase.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml | 2 +- .../greedy_until/salient_translation_error_detection.yaml | 2 +- .../tasks/bigbench/greedy_until/scientific_press_release.yaml | 2 +- .../greedy_until/semantic_parsing_in_context_sparc.yaml | 2 +- .../tasks/bigbench/greedy_until/semantic_parsing_spider.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml | 2 +- .../tasks/bigbench/greedy_until/similarities_abstraction.yaml | 2 +- lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml | 2 +- 
 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml | 2 +-
 .../greedy_until/simple_arithmetic_json_multiple_choice.yaml | 2 +-
 .../bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml | 2 +-
 .../greedy_until/simple_arithmetic_multiple_targets_json.yaml | 2 +-
 .../tasks/bigbench/greedy_until/simple_ethical_questions.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/snarks.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/social_support.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml | 2 +-
 .../tasks/bigbench/greedy_until/swahili_english_proverbs.yaml | 2 +-
 .../tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/tense.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/timedial.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml | 2 +-
 .../tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml | 2 +-
 .../bigbench/greedy_until/unnatural_in_context_learning.yaml | 2 +-
 .../tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/winowhy.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml | 2 +-
 lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml | 2 +-
 .../multiple_choice/abstract_narrative_understanding.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/analogical_similarity.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/ascii_word_recognition.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/authorship_verification.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml | 2 +-
 .../multiple_choice/bridging_anaphora_resolution_barqa.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/chess_state_tracking.yaml | 2 +-
 .../bigbench/multiple_choice/chinese_remainder_theorem.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/cifar10_classification.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/code_line_description.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/codenames.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/color.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/conceptual_combinations.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml | 2 +-
 .../contextual_parametric_knowledge_conflicts.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/dark_humor_detection.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml | 2 +-
 .../bigbench/multiple_choice/discourse_marker_prediction.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml | 2 +-
 .../bigbench/multiple_choice/emojis_emotion_prediction.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml | 2 +-
 .../bigbench/multiple_choice/english_russian_proverbs.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml | 2 +-
 .../multiple_choice/evaluating_information_essentiality.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml | 2 +-
 .../bigbench/multiple_choice/figure_of_speech_detection.yaml | 2 +-
 .../multiple_choice/formal_fallacies_syllogisms_negation.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/gem.yaml | 2 +-
 .../multiple_choice/gender_inclusive_sentences_german.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml | 2 +-
 .../bigbench/multiple_choice/gre_reading_comprehension.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml | 2 +-
 .../bigbench/multiple_choice/hindi_question_answering.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/identify_math_theorems.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml | 2 +-
 .../multiple_choice/international_phonetic_alphabet_nli.yaml | 2 +-
 .../international_phonetic_alphabet_transliterate.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/irony_identification.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/kannada.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/language_games.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/language_identification.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml | 2 +-
 .../bigbench/multiple_choice/logical_fallacy_detection.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/mathematical_induction.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/metaphor_understanding.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/misconceptions_russian.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/moral_permissibility.yaml | 2 +-
 .../multiple_choice/movie_dialog_same_or_different.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/movie_recommendation.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/natural_instructions.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/navigate.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/operators.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/paragraph_segmentation.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml | 2 +-
 .../multiple_choice/parsinlu_reading_comprehension.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/physics.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml | 2 +-
 .../bigbench/multiple_choice/play_dialog_same_or_different.yaml | 2 +-
 .../bigbench/multiple_choice/polish_sequence_labeling.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml | 2 +-
 .../multiple_choice/reasoning_about_colored_objects.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml | 2 +-
 .../multiple_choice/salient_translation_error_detection.yaml | 2 +-
 .../bigbench/multiple_choice/scientific_press_release.yaml | 2 +-
 .../multiple_choice/semantic_parsing_in_context_sparc.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml | 2 +-
 .../bigbench/multiple_choice/similarities_abstraction.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml | 2 +-
 .../multiple_choice/simple_arithmetic_json_multiple_choice.yaml | 2 +-
 .../multiple_choice/simple_arithmetic_json_subtasks.yaml | 2 +-
 .../simple_arithmetic_multiple_targets_json.yaml | 2 +-
 .../bigbench/multiple_choice/simple_ethical_questions.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/snarks.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/social_support.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/sports_understanding.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/sufficient_information.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml | 2 +-
 .../bigbench/multiple_choice/swahili_english_proverbs.yaml | 2 +-
 .../bigbench/multiple_choice/swedish_to_german_proverbs.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/symbol_interpretation.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/tense.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/timedial.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml | 2 +-
 .../bigbench/multiple_choice/tracking_shuffled_objects.yaml | 2 +-
 .../tasks/bigbench/multiple_choice/understanding_fables.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml | 2 +-
 .../bigbench/multiple_choice/unnatural_in_context_learning.yaml | 2 +-
 .../bigbench/multiple_choice/vitaminc_fact_verification.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml | 2 +-
 lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml | 2 +-
 334 files changed, 334 insertions(+), 334 deletions(-)

diff --git a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml
index 462d1be9..dd041fdd 100644
--- a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: abstract_narrative_understanding
+dataset_name: abstract_narrative_understanding_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_abstract_narrative_understanding_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml b/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml
index d62133a0..9e723927 100644
--- a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: anachronisms
+dataset_name: anachronisms_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_anachronisms_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml b/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml
index 2fedcd91..3d2e82b4 100644
--- a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: analogical_similarity
+dataset_name: analogical_similarity_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_analogical_similarity_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml b/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml
index 58de1bcf..a8425049 100644
--- a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: analytic_entailment
+dataset_name: analytic_entailment_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_analytic_entailment_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml
index 6be6a787..be296b1b 100644
--- a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: arithmetic
+dataset_name: arithmetic_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_arithmetic_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml
index 0461605a..d199e8a5 100644
--- a/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: ascii_word_recognition
+dataset_name: ascii_word_recognition_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_ascii_word_recognition_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml
index dbfa2103..65d8177c 100644
--- a/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: authorship_verification
+dataset_name: authorship_verification_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_authorship_verification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml
index 9ab1545e..3ce36427 100644
--- a/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: auto_categorization
+dataset_name: auto_categorization_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_auto_categorization_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml
index e8a491c0..e25bee24 100644
--- a/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: auto_debugging
+dataset_name: auto_debugging_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_auto_debugging_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml
index 8b97ba0a..d1d45477 100644
--- a/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: bbq_lite_json
+dataset_name: bbq_lite_json_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_bbq_lite_json_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml
index 618d8dd3..a20da27f 100644
--- a/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: bridging_anaphora_resolution_barqa
+dataset_name: bridging_anaphora_resolution_barqa_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_bridging_anaphora_resolution_barqa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml b/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml
index 687d59ba..2b9c89af 100644
--- a/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: causal_judgment
+dataset_name: causal_judgment_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_causal_judgment_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml b/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml
index a1f20264..5dd23108 100644
--- a/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: cause_and_effect
+dataset_name: cause_and_effect_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_cause_and_effect_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml
index 4089a228..06681769 100644
--- a/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: checkmate_in_one
+dataset_name: checkmate_in_one_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_checkmate_in_one_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml
index 727e7879..6a9a088e 100644
--- a/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: chess_state_tracking
+dataset_name: chess_state_tracking_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_chess_state_tracking_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml
index 6af0bcbf..f3937088 100644
--- a/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: chinese_remainder_theorem
+dataset_name: chinese_remainder_theorem_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_chinese_remainder_theorem_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml b/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml
index 3e0bf92c..6bad6797 100644
--- a/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: cifar10_classification
+dataset_name: cifar10_classification_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_cifar10_classification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml b/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml
index 624ab362..de1f7829 100644
--- a/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: code_line_description
+dataset_name: code_line_description_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_code_line_description_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/codenames.yaml b/lm_eval/tasks/bigbench/greedy_until/codenames.yaml
index 6ea8f12e..83feca88 100644
--- a/lm_eval/tasks/bigbench/greedy_until/codenames.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/codenames.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: codenames
+dataset_name: codenames_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_codenames_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/color.yaml b/lm_eval/tasks/bigbench/greedy_until/color.yaml
index 4ae393fd..5aa9c1a9 100644
--- a/lm_eval/tasks/bigbench/greedy_until/color.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/color.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: color
+dataset_name: color_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_color_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml b/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml
index 90d183ad..ec0fdc44 100644
--- a/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: common_morpheme
+dataset_name: common_morpheme_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_common_morpheme_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml
index 007649a4..5eaba446 100644
--- a/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: conceptual_combinations
+dataset_name: conceptual_combinations_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_conceptual_combinations_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml b/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml
index 3b5bafac..afae8184 100644
--- a/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: conlang_translation
+dataset_name: conlang_translation_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_conlang_translation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml
index dc594b9b..bb7eba64 100644
--- a/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: contextual_parametric_knowledge_conflicts
+dataset_name: contextual_parametric_knowledge_conflicts_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_contextual_parametric_knowledge_conflicts_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml b/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml
index aca19b7b..ae7f6b9f 100644
--- a/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: crash_blossom
+dataset_name: crash_blossom_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_crash_blossom_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml b/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml
index 043e8f47..7d56bbc2 100644
--- a/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: crass_ai
+dataset_name: crass_ai_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_crass_ai_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml
index eb9c5b3b..37fd99ad 100644
--- a/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: cryobiology_spanish
+dataset_name: cryobiology_spanish_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_cryobiology_spanish_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml b/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml
index 15c181b2..64577738 100644
--- a/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: cryptonite
+dataset_name: cryptonite_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_cryptonite_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml b/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml
index 477c2497..9279c295 100644
--- a/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: cs_algorithms
+dataset_name: cs_algorithms_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_cs_algorithms_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml
index 0521848d..014d57e6 100644
--- a/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: dark_humor_detection
+dataset_name: dark_humor_detection_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_dark_humor_detection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml
index 5936e98f..999a7e71 100644
--- a/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: date_understanding
+dataset_name: date_understanding_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_date_understanding_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml
index ffe5135a..db25589d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: disambiguation_qa
+dataset_name: disambiguation_qa_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_disambiguation_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml
index aed41150..ae8941e8 100644
--- a/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: discourse_marker_prediction
+dataset_name: discourse_marker_prediction_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_discourse_marker_prediction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml
index fd087719..0086850a 100644
--- a/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: disfl_qa
+dataset_name: disfl_qa_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_disfl_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml b/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml
index af29b7f8..e8de0093 100644
--- a/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: dyck_languages
+dataset_name: dyck_languages_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_dyck_languages_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml
index ea1a61ba..55369151 100644
--- a/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: elementary_math_qa
+dataset_name: elementary_math_qa_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_elementary_math_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml b/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml
index a8368f62..4553ede7 100644
--- a/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: emoji_movie
+dataset_name: emoji_movie_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_emoji_movie_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml b/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml
index f8392d66..e570e24a 100644
--- a/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: emojis_emotion_prediction
+dataset_name: emojis_emotion_prediction_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_emojis_emotion_prediction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml b/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml
index 97ea08c8..d4f2f3cf 100644
--- a/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: empirical_judgments
+dataset_name: empirical_judgments_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_empirical_judgments_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml
index 2eaa4a9b..b7628796 100644
--- a/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: english_proverbs
+dataset_name: english_proverbs_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_english_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml
index d0386e50..ea719e1d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: english_russian_proverbs
+dataset_name: english_russian_proverbs_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_english_russian_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml
index efb1f853..e3d89fc2 100644
--- a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: entailed_polarity
+dataset_name: entailed_polarity_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_entailed_polarity_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml
index 5922a065..e416a059 100644
--- a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: entailed_polarity_hindi
+dataset_name: entailed_polarity_hindi_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_entailed_polarity_hindi_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml
index d6307592..8f8efc4e 100644
--- a/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: epistemic_reasoning
+dataset_name: epistemic_reasoning_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_epistemic_reasoning_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml
index 13b6dd32..b35240c4 100644
--- a/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: evaluating_information_essentiality
+dataset_name: evaluating_information_essentiality_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_evaluating_information_essentiality_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml b/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml
index 6d3ccf9f..f83e4081 100644
--- a/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: fact_checker
+dataset_name: fact_checker_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_fact_checker_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml b/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml
index 16415a7b..ab38359d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: fantasy_reasoning
+dataset_name: fantasy_reasoning_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_fantasy_reasoning_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml
index 229e1c70..bf1e33e0 100644
--- a/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: few_shot_nlg
+dataset_name: few_shot_nlg_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_few_shot_nlg_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml
index 059f9f33..184cd4e6 100644
--- a/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: figure_of_speech_detection
+dataset_name: figure_of_speech_detection_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_figure_of_speech_detection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml
index 663a718c..cb1915b8 100644
--- a/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: formal_fallacies_syllogisms_negation
+dataset_name: formal_fallacies_syllogisms_negation_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_formal_fallacies_syllogisms_negation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/gem.yaml b/lm_eval/tasks/bigbench/greedy_until/gem.yaml
index 79492583..aa43ca45 100644
--- a/lm_eval/tasks/bigbench/greedy_until/gem.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/gem.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: gem
+dataset_name: gem_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_gem_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml
index 10414179..6471e577 100644
--- a/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: gender_inclusive_sentences_german
+dataset_name: gender_inclusive_sentences_german_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_gender_inclusive_sentences_german_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml
index b2a14656..93a3f875 100644
--- a/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: general_knowledge
+dataset_name: general_knowledge_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_general_knowledge_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml b/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml
index 4e256462..c3a5d9a7 100644
--- a/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: geometric_shapes
+dataset_name: geometric_shapes_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_geometric_shapes_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml
index d865e3d4..6fd557d3 100644
--- a/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: goal_step_wikihow
+dataset_name: goal_step_wikihow_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_goal_step_wikihow_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml
index 9f044835..c4416b10 100644
--- a/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: gre_reading_comprehension
+dataset_name: gre_reading_comprehension_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_gre_reading_comprehension_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml b/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml
index 1ab62b56..4060824c 100644
--- a/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: hhh_alignment
+dataset_name: hhh_alignment_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_hhh_alignment_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml
index 3a0fa8b2..5c4791b4 100644
--- a/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: hindi_question_answering
+dataset_name: hindi_question_answering_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_hindi_question_answering_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml
index 19162629..040441f7 100644
--- a/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: hindu_knowledge
+dataset_name: hindu_knowledge_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_hindu_knowledge_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml
index 84073aa0..0eb98e51 100644
--- a/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: hinglish_toxicity
+dataset_name: hinglish_toxicity_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_hinglish_toxicity_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml b/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml
index 32fc0058..c5541571 100644
--- a/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: human_organs_senses
+dataset_name: human_organs_senses_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_human_organs_senses_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml b/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml
index d3a65a87..4368f4c9 100644
--- a/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: hyperbaton
+dataset_name: hyperbaton_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_hyperbaton_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml
index 616085c8..2c08703e 100644
--- a/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: identify_math_theorems
+dataset_name: identify_math_theorems_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_identify_math_theorems_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml
index 6500f7a9..9cb39d0d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: identify_odd_metaphor
+dataset_name: identify_odd_metaphor_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_identify_odd_metaphor_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml b/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml
index fdc133f5..e216762c 100644
--- a/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: implicatures
+dataset_name: implicatures_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_implicatures_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml b/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml
index b05af0ad..c7a82a10 100644
--- a/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: implicit_relations
+dataset_name: implicit_relations_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_implicit_relations_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml
index 37769770..4839afa2 100644
--- a/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: intent_recognition
+dataset_name: intent_recognition_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_intent_recognition_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml
index 81b975c9..62643a46 100644
--- a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: international_phonetic_alphabet_nli
+dataset_name: international_phonetic_alphabet_nli_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_international_phonetic_alphabet_nli_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml
index ac664332..05feb4f5 100644
--- a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: international_phonetic_alphabet_transliterate
+dataset_name: international_phonetic_alphabet_transliterate_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_international_phonetic_alphabet_transliterate_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml b/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml
index d08f1d6a..57745d23 100644
--- a/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: intersect_geometry
+dataset_name: intersect_geometry_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_intersect_geometry_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml
index d9d5961c..b49dfb44 100644
--- a/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: irony_identification
+dataset_name: irony_identification_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_irony_identification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml
index b6a7470f..293ff6c2 100644
--- a/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: kanji_ascii
+dataset_name: kanji_ascii_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_kanji_ascii_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/kannada.yaml b/lm_eval/tasks/bigbench/greedy_until/kannada.yaml
index 50ad13c1..00eeb32a 100644
--- a/lm_eval/tasks/bigbench/greedy_until/kannada.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/kannada.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: kannada
+dataset_name: kannada_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_kannada_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml b/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml
index 6d5ad040..d313e1ce 100644
--- a/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: key_value_maps
+dataset_name: key_value_maps_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_key_value_maps_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml b/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml
index c07e0e8c..d72e1d37 100644
--- a/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: known_unknowns
+dataset_name: known_unknowns_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_known_unknowns_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/language_games.yaml b/lm_eval/tasks/bigbench/greedy_until/language_games.yaml
index 392a7190..61e85b53 100644
--- a/lm_eval/tasks/bigbench/greedy_until/language_games.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/language_games.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: language_games
+dataset_name: language_games_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_language_games_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml
index 583d9108..8db65637 100644
--- a/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: language_identification
+dataset_name: language_identification_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_language_identification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml
index 92a855a8..db6e9832 100644
--- a/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: linguistic_mappings
+dataset_name: linguistic_mappings_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_linguistic_mappings_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml
index 7aec6607..4e3981f4 100644
--- a/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: linguistics_puzzles
+dataset_name: linguistics_puzzles_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_linguistics_puzzles_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml b/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml
index f7f0d436..32afff69 100644
--- a/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: list_functions
+dataset_name: list_functions_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_list_functions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml
index 2699b12f..a1d1b5b1 100644
--- a/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: logic_grid_puzzle
+dataset_name: logic_grid_puzzle_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_logic_grid_puzzle_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml
index 9a263f96..201c04ae 100644
--- a/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: logical_args
+dataset_name: logical_args_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_logical_args_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml
index 5e72facb..1b77561d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: logical_deduction
+dataset_name: logical_deduction_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_logical_deduction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml
index a21fbc58..af3e9ea4 100644
--- a/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: logical_fallacy_detection
+dataset_name: logical_fallacy_detection_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_logical_fallacy_detection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml
index f01ce277..4d4ffe1d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: logical_sequence
+dataset_name: logical_sequence_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_logical_sequence_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml b/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml
index d4b2fcf6..84d0f419 100644
--- a/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: mathematical_induction
+dataset_name: mathematical_induction_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_mathematical_induction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml b/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml
index adf86ecc..956aa5f0 100644
--- a/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: matrixshapes
+dataset_name: matrixshapes_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_matrixshapes_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml
index 94893b4d..7fd4e53c 100644
--- a/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: metaphor_boolean
+dataset_name: metaphor_boolean_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_metaphor_boolean_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
index 8ca4da75..12b79d44 100644
--- a/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: metaphor_understanding
+dataset_name: metaphor_understanding_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_metaphor_understanding_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
index b9db2b8f..459aec57 100644
--- a/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: minute_mysteries_qa
+dataset_name: minute_mysteries_qa_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_minute_mysteries_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
index 60c8221b..25038ae3 100644
--- a/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: misconceptions
+dataset_name: misconceptions_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_misconceptions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
index a1fca685..676d94ea 100644
--- a/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: misconceptions_russian
+dataset_name: misconceptions_russian_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_misconceptions_russian_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
index b845caa3..19c9a82b 100644
--- a/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: mnist_ascii
+dataset_name: mnist_ascii_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_mnist_ascii_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
index 5dc888f6..313b5b9d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: modified_arithmetic
+dataset_name: modified_arithmetic_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_modified_arithmetic_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml b/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
index a20c23be..f478ed24 100644
--- a/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: moral_permissibility
+dataset_name: moral_permissibility_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_moral_permissibility_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
index db57a939..98e06e5d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: movie_dialog_same_or_different
+dataset_name: movie_dialog_same_or_different_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_movie_dialog_same_or_different_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
index 00a0c1a4..7cd021a4 100644
--- a/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: movie_recommendation
+dataset_name: movie_recommendation_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_movie_recommendation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
index 7a1003cf..92b84838 100644
--- a/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: mult_data_wrangling
+dataset_name: mult_data_wrangling_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_mult_data_wrangling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml b/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
index df230d77..ac4f9432 100644
--- a/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: multiemo
+dataset_name: multiemo_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_multiemo_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml b/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
index cc800106..0b87004d 100644
--- a/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: natural_instructions
+dataset_name: natural_instructions_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_natural_instructions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/navigate.yaml b/lm_eval/tasks/bigbench/greedy_until/navigate.yaml
index 1e3004d5..85fd618b 100644
--- a/lm_eval/tasks/bigbench/greedy_until/navigate.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/navigate.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: navigate
+dataset_name: navigate_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_navigate_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
index 169b6743..863b0a85 100644
--- a/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: nonsense_words_grammar
+dataset_name: nonsense_words_grammar_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_nonsense_words_grammar_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml b/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
index 9618dce2..b3b08806 100644
--- a/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: novel_concepts
+dataset_name: novel_concepts_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_novel_concepts_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml b/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
index 7b058748..fc0d6119 100644
--- a/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: object_counting
+dataset_name: object_counting_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_object_counting_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml b/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
index 1742789e..90d0fd93 100644
--- a/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: odd_one_out
+dataset_name: odd_one_out_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_odd_one_out_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/operators.yaml b/lm_eval/tasks/bigbench/greedy_until/operators.yaml
index d71d87c2..d4ad9b91 100644
--- a/lm_eval/tasks/bigbench/greedy_until/operators.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/operators.yaml
@@ -1,4 +1,4 @@
 # Generated by utils.py
-dataset_name: operators
+dataset_name: operators_zero_shot
 include: ../greedy_until_template_yaml
 task: bigbench_operators_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml
index 13d8fb9d..c661e1a7 100644
---
a/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: paragraph_segmentation +dataset_name: paragraph_segmentation_zero_shot include: ../greedy_until_template_yaml task: bigbench_paragraph_segmentation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml index f8b78f8d..4ea51e21 100644 --- a/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: parsinlu_qa +dataset_name: parsinlu_qa_zero_shot include: ../greedy_until_template_yaml task: bigbench_parsinlu_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml index 4db292d0..967741fd 100644 --- a/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: parsinlu_reading_comprehension +dataset_name: parsinlu_reading_comprehension_zero_shot include: ../greedy_until_template_yaml task: bigbench_parsinlu_reading_comprehension_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml index a282fa64..5e59b741 100644 --- a/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: penguins_in_a_table +dataset_name: penguins_in_a_table_zero_shot include: ../greedy_until_template_yaml task: bigbench_penguins_in_a_table_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml b/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml index 458a2e3d..a7ed5a82 100644 --- a/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: periodic_elements +dataset_name: periodic_elements_zero_shot include: ../greedy_until_template_yaml task: bigbench_periodic_elements_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml b/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml index e51eb69a..087d4688 100644 --- a/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: persian_idioms +dataset_name: persian_idioms_zero_shot include: ../greedy_until_template_yaml task: bigbench_persian_idioms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml index 3b03a67f..c2da5cce 100644 --- a/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: phrase_relatedness +dataset_name: phrase_relatedness_zero_shot include: ../greedy_until_template_yaml task: bigbench_phrase_relatedness_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml b/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml index 358b7db3..1482fe65 100644 --- 
a/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: physical_intuition +dataset_name: physical_intuition_zero_shot include: ../greedy_until_template_yaml task: bigbench_physical_intuition_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/physics.yaml b/lm_eval/tasks/bigbench/greedy_until/physics.yaml index d9d6f936..7fade7b3 100644 --- a/lm_eval/tasks/bigbench/greedy_until/physics.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/physics.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: physics +dataset_name: physics_zero_shot include: ../greedy_until_template_yaml task: bigbench_physics_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml index 6af11448..bf332361 100644 --- a/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: physics_questions +dataset_name: physics_questions_zero_shot include: ../greedy_until_template_yaml task: bigbench_physics_questions_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml index 600143e9..1ddf7ca7 100644 --- a/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: play_dialog_same_or_different +dataset_name: play_dialog_same_or_different_zero_shot include: ../greedy_until_template_yaml task: bigbench_play_dialog_same_or_different_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml index 432820ad..10c8bd98 100644 --- a/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: polish_sequence_labeling +dataset_name: polish_sequence_labeling_zero_shot include: ../greedy_until_template_yaml task: bigbench_polish_sequence_labeling_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml index c492b17f..66d0e5ea 100644 --- a/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: presuppositions_as_nli +dataset_name: presuppositions_as_nli_zero_shot include: ../greedy_until_template_yaml task: bigbench_presuppositions_as_nli_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml b/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml index a23ea6e7..67240110 100644 --- a/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: qa_wikidata +dataset_name: qa_wikidata_zero_shot include: ../greedy_until_template_yaml task: bigbench_qa_wikidata_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml b/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml index 47953c14..5652cb3f 100644 --- 
a/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: question_selection +dataset_name: question_selection_zero_shot include: ../greedy_until_template_yaml task: bigbench_question_selection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml index e15af76e..c206597b 100644 --- a/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: real_or_fake_text +dataset_name: real_or_fake_text_zero_shot include: ../greedy_until_template_yaml task: bigbench_real_or_fake_text_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml index b1aa5ec0..8b1051e5 100644 --- a/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: reasoning_about_colored_objects +dataset_name: reasoning_about_colored_objects_zero_shot include: ../greedy_until_template_yaml task: bigbench_reasoning_about_colored_objects_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml index 12831cc7..279ecd01 100644 --- a/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: repeat_copy_logic +dataset_name: repeat_copy_logic_zero_shot include: ../greedy_until_template_yaml task: bigbench_repeat_copy_logic_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml b/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml index 78c6bda7..90135638 100644 --- a/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: rephrase +dataset_name: rephrase_zero_shot include: ../greedy_until_template_yaml task: bigbench_rephrase_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml b/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml index e93b4aa9..a11c167d 100644 --- a/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: riddle_sense +dataset_name: riddle_sense_zero_shot include: ../greedy_until_template_yaml task: bigbench_riddle_sense_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml b/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml index 46039e1f..5074e010 100644 --- a/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: ruin_names +dataset_name: ruin_names_zero_shot include: ../greedy_until_template_yaml task: bigbench_ruin_names_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml index a7e5c77e..7f2ce433 100644 --- a/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml +++ 
b/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: salient_translation_error_detection +dataset_name: salient_translation_error_detection_zero_shot include: ../greedy_until_template_yaml task: bigbench_salient_translation_error_detection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml b/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml index aa35e659..90071882 100644 --- a/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: scientific_press_release +dataset_name: scientific_press_release_zero_shot include: ../greedy_until_template_yaml task: bigbench_scientific_press_release_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml index 184bfcb9..93ddccc2 100644 --- a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: semantic_parsing_in_context_sparc +dataset_name: semantic_parsing_in_context_sparc_zero_shot include: ../greedy_until_template_yaml task: bigbench_semantic_parsing_in_context_sparc_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml index ae0b9461..cc590faf 100644 --- a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: semantic_parsing_spider +dataset_name: semantic_parsing_spider_zero_shot include: ../greedy_until_template_yaml task: bigbench_semantic_parsing_spider_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml b/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml index bb72ec88..6cbacb79 100644 --- a/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: sentence_ambiguity +dataset_name: sentence_ambiguity_zero_shot include: ../greedy_until_template_yaml task: bigbench_sentence_ambiguity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml b/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml index 5c1ef27f..10e9a439 100644 --- a/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: similarities_abstraction +dataset_name: similarities_abstraction_zero_shot include: ../greedy_until_template_yaml task: bigbench_similarities_abstraction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml index 742df0fb..a82b8226 100644 --- a/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simp_turing_concept +dataset_name: simp_turing_concept_zero_shot include: ../greedy_until_template_yaml task: 
bigbench_simp_turing_concept_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml index 4e70a160..8e0a207e 100644 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_json +dataset_name: simple_arithmetic_json_zero_shot include: ../greedy_until_template_yaml task: bigbench_simple_arithmetic_json_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml index 5f6b6732..df235325 100644 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_json_multiple_choice +dataset_name: simple_arithmetic_json_multiple_choice_zero_shot include: ../greedy_until_template_yaml task: bigbench_simple_arithmetic_json_multiple_choice_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml index 32c5fcfd..2f981fb0 100644 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_json_subtasks +dataset_name: simple_arithmetic_json_subtasks_zero_shot include: ../greedy_until_template_yaml task: bigbench_simple_arithmetic_json_subtasks_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml index 0d87803e..2bc6cf16 100644 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_multiple_targets_json +dataset_name: simple_arithmetic_multiple_targets_json_zero_shot include: ../greedy_until_template_yaml task: bigbench_simple_arithmetic_multiple_targets_json_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml index 2332985c..77e45a58 100644 --- a/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_ethical_questions +dataset_name: simple_ethical_questions_zero_shot include: ../greedy_until_template_yaml task: bigbench_simple_ethical_questions_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml index 1d9943e4..1b485d5c 100644 --- a/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_text_editing +dataset_name: simple_text_editing_zero_shot include: ../greedy_until_template_yaml task: bigbench_simple_text_editing_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/snarks.yaml b/lm_eval/tasks/bigbench/greedy_until/snarks.yaml index e98308e1..9ccbda74 100644 --- a/lm_eval/tasks/bigbench/greedy_until/snarks.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/snarks.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: snarks +dataset_name: snarks_zero_shot include: ../greedy_until_template_yaml task: bigbench_snarks_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml b/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml index 7af09b30..9cbc5ec5 100644 --- a/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: social_iqa +dataset_name: social_iqa_zero_shot include: ../greedy_until_template_yaml task: bigbench_social_iqa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/social_support.yaml b/lm_eval/tasks/bigbench/greedy_until/social_support.yaml index 8e34e758..bcc3a9d1 100644 --- a/lm_eval/tasks/bigbench/greedy_until/social_support.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/social_support.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: social_support +dataset_name: social_support_zero_shot include: ../greedy_until_template_yaml task: bigbench_social_support_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml index 3ae80c24..01082a10 100644 --- a/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: sports_understanding +dataset_name: sports_understanding_zero_shot include: ../greedy_until_template_yaml task: bigbench_sports_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml b/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml index b6020b08..a0bf1c46 100644 --- a/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: strange_stories +dataset_name: strange_stories_zero_shot include: ../greedy_until_template_yaml task: bigbench_strange_stories_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml b/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml index 066c89d1..495d873f 100644 --- a/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: strategyqa +dataset_name: strategyqa_zero_shot include: ../greedy_until_template_yaml task: bigbench_strategyqa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml b/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml index 27ef04dd..3484952c 100644 --- a/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: sufficient_information +dataset_name: sufficient_information_zero_shot include: ../greedy_until_template_yaml task: bigbench_sufficient_information_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml b/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml index a7496025..a8e980d5 100644 --- a/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml +++ 
b/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: suicide_risk +dataset_name: suicide_risk_zero_shot include: ../greedy_until_template_yaml task: bigbench_suicide_risk_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml index 25e7dfe6..ff045534 100644 --- a/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: swahili_english_proverbs +dataset_name: swahili_english_proverbs_zero_shot include: ../greedy_until_template_yaml task: bigbench_swahili_english_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml index 1beebb17..8cbd401b 100644 --- a/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: swedish_to_german_proverbs +dataset_name: swedish_to_german_proverbs_zero_shot include: ../greedy_until_template_yaml task: bigbench_swedish_to_german_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml index 27b29a05..3fa4cdba 100644 --- a/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: symbol_interpretation +dataset_name: symbol_interpretation_zero_shot include: ../greedy_until_template_yaml task: bigbench_symbol_interpretation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml b/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml index 6ed42414..c20300f8 100644 --- a/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: temporal_sequences +dataset_name: temporal_sequences_zero_shot include: ../greedy_until_template_yaml task: bigbench_temporal_sequences_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/tense.yaml b/lm_eval/tasks/bigbench/greedy_until/tense.yaml index 49adc7c2..b1b5698d 100644 --- a/lm_eval/tasks/bigbench/greedy_until/tense.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/tense.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: tense +dataset_name: tense_zero_shot include: ../greedy_until_template_yaml task: bigbench_tense_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/timedial.yaml b/lm_eval/tasks/bigbench/greedy_until/timedial.yaml index 391dff43..d5f1950e 100644 --- a/lm_eval/tasks/bigbench/greedy_until/timedial.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/timedial.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: timedial +dataset_name: timedial_zero_shot include: ../greedy_until_template_yaml task: bigbench_timedial_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml b/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml index f9f1893f..4ec83039 100644 --- a/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: 
topical_chat +dataset_name: topical_chat_zero_shot include: ../greedy_until_template_yaml task: bigbench_topical_chat_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml index 675b0e37..27024bee 100644 --- a/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: tracking_shuffled_objects +dataset_name: tracking_shuffled_objects_zero_shot include: ../greedy_until_template_yaml task: bigbench_tracking_shuffled_objects_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml b/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml index 3c5ff40a..f467652d 100644 --- a/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: understanding_fables +dataset_name: understanding_fables_zero_shot include: ../greedy_until_template_yaml task: bigbench_understanding_fables_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml b/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml index 8e0c0699..d91ff331 100644 --- a/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: undo_permutation +dataset_name: undo_permutation_zero_shot include: ../greedy_until_template_yaml task: bigbench_undo_permutation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml index 384ccc05..a31929fb 100644 --- a/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: unit_conversion +dataset_name: unit_conversion_zero_shot include: ../greedy_until_template_yaml task: bigbench_unit_conversion_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml index a33bfd51..ca4c38be 100644 --- a/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: unit_interpretation +dataset_name: unit_interpretation_zero_shot include: ../greedy_until_template_yaml task: bigbench_unit_interpretation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml index cb3d2572..1cc271d2 100644 --- a/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: unnatural_in_context_learning +dataset_name: unnatural_in_context_learning_zero_shot include: ../greedy_until_template_yaml task: bigbench_unnatural_in_context_learning_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml index 67380ab9..770e8500 100644 --- a/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml +++ 
b/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: vitaminc_fact_verification +dataset_name: vitaminc_fact_verification_zero_shot include: ../greedy_until_template_yaml task: bigbench_vitaminc_fact_verification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml index baad0d9e..8c60da65 100644 --- a/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: what_is_the_tao +dataset_name: what_is_the_tao_zero_shot include: ../greedy_until_template_yaml task: bigbench_what_is_the_tao_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml index 70047ee7..4eda6d08 100644 --- a/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: which_wiki_edit +dataset_name: which_wiki_edit_zero_shot include: ../greedy_until_template_yaml task: bigbench_which_wiki_edit_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml b/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml index fff312b3..e065c80c 100644 --- a/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: winowhy +dataset_name: winowhy_zero_shot include: ../greedy_until_template_yaml task: bigbench_winowhy_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml b/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml index 77b55d77..caa6f02d 100644 --- a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: word_sorting +dataset_name: word_sorting_zero_shot include: ../greedy_until_template_yaml task: bigbench_word_sorting_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml b/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml index 75fe7de9..774aef15 100644 --- a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml +++ b/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: word_unscrambling +dataset_name: word_unscrambling_zero_shot include: ../greedy_until_template_yaml task: bigbench_word_unscrambling_greedy_until diff --git a/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml index e815ad82..34cefc25 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: abstract_narrative_understanding +dataset_name: abstract_narrative_understanding_zero_shot include: ../multiple_choice_template_yaml task: bigbench_abstract_narrative_understanding_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml b/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml index 0edb33ae..b1e2903c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml @@ 
-1,4 +1,4 @@ # Generated by utils.py -dataset_name: anachronisms +dataset_name: anachronisms_zero_shot include: ../multiple_choice_template_yaml task: bigbench_anachronisms_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml b/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml index 4a63e23a..6e20092e 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: analogical_similarity +dataset_name: analogical_similarity_zero_shot include: ../multiple_choice_template_yaml task: bigbench_analogical_similarity_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml b/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml index 3503337d..9ecf8fb5 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: analytic_entailment +dataset_name: analytic_entailment_zero_shot include: ../multiple_choice_template_yaml task: bigbench_analytic_entailment_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml b/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml index a7af2d17..9b19b92f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: arithmetic +dataset_name: arithmetic_zero_shot include: ../multiple_choice_template_yaml task: bigbench_arithmetic_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml index 9eca1362..254f115b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: ascii_word_recognition +dataset_name: ascii_word_recognition_zero_shot include: ../multiple_choice_template_yaml task: bigbench_ascii_word_recognition_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml b/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml index 0c49e8ee..4caeacd4 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: authorship_verification +dataset_name: authorship_verification_zero_shot include: ../multiple_choice_template_yaml task: bigbench_authorship_verification_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml b/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml index 108cc802..16e62e69 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: auto_categorization +dataset_name: auto_categorization_zero_shot include: ../multiple_choice_template_yaml task: bigbench_auto_categorization_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml b/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml index 7ae0c2a5..72db1d8e 100644 --- 
a/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: auto_debugging +dataset_name: auto_debugging_zero_shot include: ../multiple_choice_template_yaml task: bigbench_auto_debugging_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml index 6cb2bff4..3c4be304 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: bbq_lite_json +dataset_name: bbq_lite_json_zero_shot include: ../multiple_choice_template_yaml task: bigbench_bbq_lite_json_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml index 33871759..73448ad9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: bridging_anaphora_resolution_barqa +dataset_name: bridging_anaphora_resolution_barqa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_bridging_anaphora_resolution_barqa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml b/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml index 340e9bda..1d09f2d4 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: causal_judgment +dataset_name: causal_judgment_zero_shot include: ../multiple_choice_template_yaml task: bigbench_causal_judgment_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml b/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml index 4b3dd1a6..c39ec278 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: cause_and_effect +dataset_name: cause_and_effect_zero_shot include: ../multiple_choice_template_yaml task: bigbench_cause_and_effect_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml index 000360c0..0a9883d0 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: checkmate_in_one +dataset_name: checkmate_in_one_zero_shot include: ../multiple_choice_template_yaml task: bigbench_checkmate_in_one_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml index b6d1f2e2..ea299797 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: chess_state_tracking +dataset_name: chess_state_tracking_zero_shot include: ../multiple_choice_template_yaml task: bigbench_chess_state_tracking_multiple_choice diff --git 
a/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml index 2552166c..c24d5761 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: chinese_remainder_theorem +dataset_name: chinese_remainder_theorem_zero_shot include: ../multiple_choice_template_yaml task: bigbench_chinese_remainder_theorem_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml b/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml index a03e56c0..f5918e60 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: cifar10_classification +dataset_name: cifar10_classification_zero_shot include: ../multiple_choice_template_yaml task: bigbench_cifar10_classification_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml b/lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml index f2a33424..9360f759 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: code_line_description +dataset_name: code_line_description_zero_shot include: ../multiple_choice_template_yaml task: bigbench_code_line_description_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/codenames.yaml b/lm_eval/tasks/bigbench/multiple_choice/codenames.yaml index c03dc365..5655ea1f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/codenames.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/codenames.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: codenames +dataset_name: codenames_zero_shot include: ../multiple_choice_template_yaml task: bigbench_codenames_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/color.yaml b/lm_eval/tasks/bigbench/multiple_choice/color.yaml index f49710c7..7350013f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/color.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/color.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: color +dataset_name: color_zero_shot include: ../multiple_choice_template_yaml task: bigbench_color_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml b/lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml index 619c8eea..bf8f3aca 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: common_morpheme +dataset_name: common_morpheme_zero_shot include: ../multiple_choice_template_yaml task: bigbench_common_morpheme_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml index a7570bb0..3ee13b37 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: conceptual_combinations +dataset_name: conceptual_combinations_zero_shot include: ../multiple_choice_template_yaml task: 
bigbench_conceptual_combinations_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml b/lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml index 4ff6ef02..e5a28097 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/conlang_translation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: conlang_translation +dataset_name: conlang_translation_zero_shot include: ../multiple_choice_template_yaml task: bigbench_conlang_translation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml index 4b34eec8..3bf9d9bf 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: contextual_parametric_knowledge_conflicts +dataset_name: contextual_parametric_knowledge_conflicts_zero_shot include: ../multiple_choice_template_yaml task: bigbench_contextual_parametric_knowledge_conflicts_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml b/lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml index 2b0b9d46..4aca69ad 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: crash_blossom +dataset_name: crash_blossom_zero_shot include: ../multiple_choice_template_yaml task: bigbench_crash_blossom_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml b/lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml index c203459a..ac7c1820 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: crass_ai +dataset_name: crass_ai_zero_shot include: ../multiple_choice_template_yaml task: bigbench_crass_ai_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml index c8cdd625..c187505d 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: cryobiology_spanish +dataset_name: cryobiology_spanish_zero_shot include: ../multiple_choice_template_yaml task: bigbench_cryobiology_spanish_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml b/lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml index 503cd601..c5e0519f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/cryptonite.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: cryptonite +dataset_name: cryptonite_zero_shot include: ../multiple_choice_template_yaml task: bigbench_cryptonite_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml b/lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml index bb9d90b0..0b8e694c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: cs_algorithms 
+dataset_name: cs_algorithms_zero_shot include: ../multiple_choice_template_yaml task: bigbench_cs_algorithms_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml index cb00651a..3a77ea44 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: dark_humor_detection +dataset_name: dark_humor_detection_zero_shot include: ../multiple_choice_template_yaml task: bigbench_dark_humor_detection_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml index 596a941e..2851f0bb 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: date_understanding +dataset_name: date_understanding_zero_shot include: ../multiple_choice_template_yaml task: bigbench_date_understanding_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml index 5264c21f..2827232a 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: disambiguation_qa +dataset_name: disambiguation_qa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_disambiguation_qa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml index 151616c2..5a18733f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: discourse_marker_prediction +dataset_name: discourse_marker_prediction_zero_shot include: ../multiple_choice_template_yaml task: bigbench_discourse_marker_prediction_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml index 578df2a3..bf8494cf 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/disfl_qa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: disfl_qa +dataset_name: disfl_qa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_disfl_qa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml b/lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml index 07ecf4d4..48d6f32e 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: dyck_languages +dataset_name: dyck_languages_zero_shot include: ../multiple_choice_template_yaml task: bigbench_dyck_languages_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml index d9e41204..64cb58ff 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml @@ -1,4 +1,4 @@ # 
Generated by utils.py -dataset_name: elementary_math_qa +dataset_name: elementary_math_qa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_elementary_math_qa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml b/lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml index f6528de7..0604d97d 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: emoji_movie +dataset_name: emoji_movie_zero_shot include: ../multiple_choice_template_yaml task: bigbench_emoji_movie_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml b/lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml index cedbd41c..ff648d9c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: emojis_emotion_prediction +dataset_name: emojis_emotion_prediction_zero_shot include: ../multiple_choice_template_yaml task: bigbench_emojis_emotion_prediction_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml b/lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml index 078a3c45..c848740b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: empirical_judgments +dataset_name: empirical_judgments_zero_shot include: ../multiple_choice_template_yaml task: bigbench_empirical_judgments_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml index 0dd3a6c6..8adc12e9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: english_proverbs +dataset_name: english_proverbs_zero_shot include: ../multiple_choice_template_yaml task: bigbench_english_proverbs_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml index 12c7dae6..ed26147a 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: english_russian_proverbs +dataset_name: english_russian_proverbs_zero_shot include: ../multiple_choice_template_yaml task: bigbench_english_russian_proverbs_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml index 336a013e..24444e55 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: entailed_polarity +dataset_name: entailed_polarity_zero_shot include: ../multiple_choice_template_yaml task: bigbench_entailed_polarity_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml index 4d1bf0e8..32878c8b 100644 --- 
a/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: entailed_polarity_hindi +dataset_name: entailed_polarity_hindi_zero_shot include: ../multiple_choice_template_yaml task: bigbench_entailed_polarity_hindi_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml index 79827577..2c35581a 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: epistemic_reasoning +dataset_name: epistemic_reasoning_zero_shot include: ../multiple_choice_template_yaml task: bigbench_epistemic_reasoning_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml index f82cd899..b85acd95 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: evaluating_information_essentiality +dataset_name: evaluating_information_essentiality_zero_shot include: ../multiple_choice_template_yaml task: bigbench_evaluating_information_essentiality_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml b/lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml index 2e20aabe..4fbed803 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: fact_checker +dataset_name: fact_checker_zero_shot include: ../multiple_choice_template_yaml task: bigbench_fact_checker_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml b/lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml index e7931f2f..68a55e47 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: fantasy_reasoning +dataset_name: fantasy_reasoning_zero_shot include: ../multiple_choice_template_yaml task: bigbench_fantasy_reasoning_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml index 593c4860..39fcd9cf 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/few_shot_nlg.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: few_shot_nlg +dataset_name: few_shot_nlg_zero_shot include: ../multiple_choice_template_yaml task: bigbench_few_shot_nlg_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml index 00f07670..68a83956 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: figure_of_speech_detection +dataset_name: figure_of_speech_detection_zero_shot include: ../multiple_choice_template_yaml task: 
bigbench_figure_of_speech_detection_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml index b2eb5aca..7ff37fd7 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: formal_fallacies_syllogisms_negation +dataset_name: formal_fallacies_syllogisms_negation_zero_shot include: ../multiple_choice_template_yaml task: bigbench_formal_fallacies_syllogisms_negation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/gem.yaml b/lm_eval/tasks/bigbench/multiple_choice/gem.yaml index 5fd4caae..bf81e880 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/gem.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/gem.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: gem +dataset_name: gem_zero_shot include: ../multiple_choice_template_yaml task: bigbench_gem_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml index 77d16864..39eee21a 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/gender_inclusive_sentences_german.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: gender_inclusive_sentences_german +dataset_name: gender_inclusive_sentences_german_zero_shot include: ../multiple_choice_template_yaml task: bigbench_gender_inclusive_sentences_german_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml b/lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml index 021ad284..8083b869 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: general_knowledge +dataset_name: general_knowledge_zero_shot include: ../multiple_choice_template_yaml task: bigbench_general_knowledge_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml b/lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml index cfc2ada2..7b80acbf 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: geometric_shapes +dataset_name: geometric_shapes_zero_shot include: ../multiple_choice_template_yaml task: bigbench_geometric_shapes_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml index e457887f..6413fb03 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: goal_step_wikihow +dataset_name: goal_step_wikihow_zero_shot include: ../multiple_choice_template_yaml task: bigbench_goal_step_wikihow_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml index 8ec630d5..53523c33 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml +++ 
b/lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: gre_reading_comprehension +dataset_name: gre_reading_comprehension_zero_shot include: ../multiple_choice_template_yaml task: bigbench_gre_reading_comprehension_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml b/lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml index 94272e8a..c5e4f24a 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: hhh_alignment +dataset_name: hhh_alignment_zero_shot include: ../multiple_choice_template_yaml task: bigbench_hhh_alignment_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml index 0ab2cecd..ed1ed278 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/hindi_question_answering.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: hindi_question_answering +dataset_name: hindi_question_answering_zero_shot include: ../multiple_choice_template_yaml task: bigbench_hindi_question_answering_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml index 2d49951b..321f7513 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: hindu_knowledge +dataset_name: hindu_knowledge_zero_shot include: ../multiple_choice_template_yaml task: bigbench_hindu_knowledge_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml index 4c7ca8bd..5dac090f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: hinglish_toxicity +dataset_name: hinglish_toxicity_zero_shot include: ../multiple_choice_template_yaml task: bigbench_hinglish_toxicity_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml b/lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml index d04bccc9..2fef6d93 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: human_organs_senses +dataset_name: human_organs_senses_zero_shot include: ../multiple_choice_template_yaml task: bigbench_human_organs_senses_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml b/lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml index 9e15ffac..34b37710 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: hyperbaton +dataset_name: hyperbaton_zero_shot include: ../multiple_choice_template_yaml task: bigbench_hyperbaton_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml index dfb75722..f716129d 100644 --- 
a/lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: identify_math_theorems +dataset_name: identify_math_theorems_zero_shot include: ../multiple_choice_template_yaml task: bigbench_identify_math_theorems_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml index 3657d3b0..93c4c244 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: identify_odd_metaphor +dataset_name: identify_odd_metaphor_zero_shot include: ../multiple_choice_template_yaml task: bigbench_identify_odd_metaphor_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml b/lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml index 8c2d4c81..9a26fd55 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: implicatures +dataset_name: implicatures_zero_shot include: ../multiple_choice_template_yaml task: bigbench_implicatures_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml b/lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml index a837cdf3..9bb08442 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: implicit_relations +dataset_name: implicit_relations_zero_shot include: ../multiple_choice_template_yaml task: bigbench_implicit_relations_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml b/lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml index 9d9cb82b..720ac92a 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: intent_recognition +dataset_name: intent_recognition_zero_shot include: ../multiple_choice_template_yaml task: bigbench_intent_recognition_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml index 715582af..89d7742d 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: international_phonetic_alphabet_nli +dataset_name: international_phonetic_alphabet_nli_zero_shot include: ../multiple_choice_template_yaml task: bigbench_international_phonetic_alphabet_nli_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml index cd6f6f71..c8e866e2 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_transliterate.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: 
international_phonetic_alphabet_transliterate +dataset_name: international_phonetic_alphabet_transliterate_zero_shot include: ../multiple_choice_template_yaml task: bigbench_international_phonetic_alphabet_transliterate_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml b/lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml index d6448572..6014a175 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: intersect_geometry +dataset_name: intersect_geometry_zero_shot include: ../multiple_choice_template_yaml task: bigbench_intersect_geometry_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml b/lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml index bb8385f2..a19ff99e 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: irony_identification +dataset_name: irony_identification_zero_shot include: ../multiple_choice_template_yaml task: bigbench_irony_identification_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml b/lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml index 7ba101c8..a90a8286 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: kanji_ascii +dataset_name: kanji_ascii_zero_shot include: ../multiple_choice_template_yaml task: bigbench_kanji_ascii_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/kannada.yaml b/lm_eval/tasks/bigbench/multiple_choice/kannada.yaml index e3767b21..910cec47 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/kannada.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/kannada.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: kannada +dataset_name: kannada_zero_shot include: ../multiple_choice_template_yaml task: bigbench_kannada_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml b/lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml index 88c6bf5e..75a673c8 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: key_value_maps +dataset_name: key_value_maps_zero_shot include: ../multiple_choice_template_yaml task: bigbench_key_value_maps_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml b/lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml index de972c64..1c5f6293 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: known_unknowns +dataset_name: known_unknowns_zero_shot include: ../multiple_choice_template_yaml task: bigbench_known_unknowns_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/language_games.yaml b/lm_eval/tasks/bigbench/multiple_choice/language_games.yaml index 3e17fd8f..07e2711b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/language_games.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/language_games.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: 
language_games +dataset_name: language_games_zero_shot include: ../multiple_choice_template_yaml task: bigbench_language_games_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml b/lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml index e17cdc69..9ea141fb 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: language_identification +dataset_name: language_identification_zero_shot include: ../multiple_choice_template_yaml task: bigbench_language_identification_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml index 118de388..50800d9d 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/linguistic_mappings.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: linguistic_mappings +dataset_name: linguistic_mappings_zero_shot include: ../multiple_choice_template_yaml task: bigbench_linguistic_mappings_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml index 4799e672..e269cd04 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/linguistics_puzzles.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: linguistics_puzzles +dataset_name: linguistics_puzzles_zero_shot include: ../multiple_choice_template_yaml task: bigbench_linguistics_puzzles_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml b/lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml index f2c94ada..4f4f2ca1 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/list_functions.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: list_functions +dataset_name: list_functions_zero_shot include: ../multiple_choice_template_yaml task: bigbench_list_functions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml index c24e71ac..da6a018f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: logic_grid_puzzle +dataset_name: logic_grid_puzzle_zero_shot include: ../multiple_choice_template_yaml task: bigbench_logic_grid_puzzle_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml index 11e2771e..84f55f64 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: logical_args +dataset_name: logical_args_zero_shot include: ../multiple_choice_template_yaml task: bigbench_logical_args_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml index 0de47251..592d2afa 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml @@ 
-1,4 +1,4 @@ # Generated by utils.py -dataset_name: logical_deduction +dataset_name: logical_deduction_zero_shot include: ../multiple_choice_template_yaml task: bigbench_logical_deduction_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml index b4d68c1b..1c6411af 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: logical_fallacy_detection +dataset_name: logical_fallacy_detection_zero_shot include: ../multiple_choice_template_yaml task: bigbench_logical_fallacy_detection_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml b/lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml index e58224b9..65671894 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: logical_sequence +dataset_name: logical_sequence_zero_shot include: ../multiple_choice_template_yaml task: bigbench_logical_sequence_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml b/lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml index 316b8eed..4ed0ad3c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: mathematical_induction +dataset_name: mathematical_induction_zero_shot include: ../multiple_choice_template_yaml task: bigbench_mathematical_induction_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml b/lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml index ebbc32f5..9facf639 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/matrixshapes.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: matrixshapes +dataset_name: matrixshapes_zero_shot include: ../multiple_choice_template_yaml task: bigbench_matrixshapes_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml index 2bbe0c00..7c476c4e 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: metaphor_boolean +dataset_name: metaphor_boolean_zero_shot include: ../multiple_choice_template_yaml task: bigbench_metaphor_boolean_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml index ae0fab49..6661a54f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: metaphor_understanding +dataset_name: metaphor_understanding_zero_shot include: ../multiple_choice_template_yaml task: bigbench_metaphor_understanding_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml index 76b1bac0..67109c8c 100644 --- 
a/lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/minute_mysteries_qa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: minute_mysteries_qa +dataset_name: minute_mysteries_qa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_minute_mysteries_qa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml b/lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml index dce2a5c2..63d0fcda 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: misconceptions +dataset_name: misconceptions_zero_shot include: ../multiple_choice_template_yaml task: bigbench_misconceptions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml index fca2b324..f9c5db38 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: misconceptions_russian +dataset_name: misconceptions_russian_zero_shot include: ../multiple_choice_template_yaml task: bigbench_misconceptions_russian_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml b/lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml index ac32701f..a1b091da 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: mnist_ascii +dataset_name: mnist_ascii_zero_shot include: ../multiple_choice_template_yaml task: bigbench_mnist_ascii_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml index fd5c271a..c8a23735 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/modified_arithmetic.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: modified_arithmetic +dataset_name: modified_arithmetic_zero_shot include: ../multiple_choice_template_yaml task: bigbench_modified_arithmetic_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml b/lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml index 95414745..38295552 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: moral_permissibility +dataset_name: moral_permissibility_zero_shot include: ../multiple_choice_template_yaml task: bigbench_moral_permissibility_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml index 831b261a..89b93d9d 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: movie_dialog_same_or_different +dataset_name: movie_dialog_same_or_different_zero_shot include: ../multiple_choice_template_yaml task: bigbench_movie_dialog_same_or_different_multiple_choice diff --git 
a/lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml b/lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml index 16d4ea55..7055028e 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: movie_recommendation +dataset_name: movie_recommendation_zero_shot include: ../multiple_choice_template_yaml task: bigbench_movie_recommendation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml index b7693b06..17b67bcc 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/mult_data_wrangling.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: mult_data_wrangling +dataset_name: mult_data_wrangling_zero_shot include: ../multiple_choice_template_yaml task: bigbench_mult_data_wrangling_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml b/lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml index 8c954b5d..10ff48ea 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: multiemo +dataset_name: multiemo_zero_shot include: ../multiple_choice_template_yaml task: bigbench_multiemo_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml b/lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml index 78d295c5..4874dd15 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/natural_instructions.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: natural_instructions +dataset_name: natural_instructions_zero_shot include: ../multiple_choice_template_yaml task: bigbench_natural_instructions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/navigate.yaml b/lm_eval/tasks/bigbench/multiple_choice/navigate.yaml index 3bc9f120..e69f2790 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/navigate.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/navigate.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: navigate +dataset_name: navigate_zero_shot include: ../multiple_choice_template_yaml task: bigbench_navigate_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml index 7a7b2d80..52d25bca 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: nonsense_words_grammar +dataset_name: nonsense_words_grammar_zero_shot include: ../multiple_choice_template_yaml task: bigbench_nonsense_words_grammar_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml b/lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml index 04172c1a..3fc74aa9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: novel_concepts +dataset_name: novel_concepts_zero_shot include: ../multiple_choice_template_yaml task: bigbench_novel_concepts_multiple_choice diff --git 
a/lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml b/lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml index c6ab4011..277d843d 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/object_counting.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: object_counting +dataset_name: object_counting_zero_shot include: ../multiple_choice_template_yaml task: bigbench_object_counting_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml b/lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml index 82d70a63..aaa43e67 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: odd_one_out +dataset_name: odd_one_out_zero_shot include: ../multiple_choice_template_yaml task: bigbench_odd_one_out_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/operators.yaml b/lm_eval/tasks/bigbench/multiple_choice/operators.yaml index e48c8005..951db6f9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/operators.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/operators.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: operators +dataset_name: operators_zero_shot include: ../multiple_choice_template_yaml task: bigbench_operators_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml index 3423101a..2cfc8283 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/paragraph_segmentation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: paragraph_segmentation +dataset_name: paragraph_segmentation_zero_shot include: ../multiple_choice_template_yaml task: bigbench_paragraph_segmentation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml index a2f65cde..7a9b61fb 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: parsinlu_qa +dataset_name: parsinlu_qa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_parsinlu_qa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml index 3f0f6182..5fa0eccc 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/parsinlu_reading_comprehension.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: parsinlu_reading_comprehension +dataset_name: parsinlu_reading_comprehension_zero_shot include: ../multiple_choice_template_yaml task: bigbench_parsinlu_reading_comprehension_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml index ed4945f9..de024e2e 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: penguins_in_a_table +dataset_name: penguins_in_a_table_zero_shot include: ../multiple_choice_template_yaml task: 
bigbench_penguins_in_a_table_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml b/lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml index 5adb9422..b7a644f9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: periodic_elements +dataset_name: periodic_elements_zero_shot include: ../multiple_choice_template_yaml task: bigbench_periodic_elements_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml b/lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml index c0ee240f..6fa92ed3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: persian_idioms +dataset_name: persian_idioms_zero_shot include: ../multiple_choice_template_yaml task: bigbench_persian_idioms_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml index 6231e5c0..c797aec6 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: phrase_relatedness +dataset_name: phrase_relatedness_zero_shot include: ../multiple_choice_template_yaml task: bigbench_phrase_relatedness_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml b/lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml index 50353ac7..089376dd 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: physical_intuition +dataset_name: physical_intuition_zero_shot include: ../multiple_choice_template_yaml task: bigbench_physical_intuition_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/physics.yaml b/lm_eval/tasks/bigbench/multiple_choice/physics.yaml index f3b4244e..bc06f79d 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/physics.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/physics.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: physics +dataset_name: physics_zero_shot include: ../multiple_choice_template_yaml task: bigbench_physics_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml b/lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml index ec5e9531..44646f14 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/physics_questions.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: physics_questions +dataset_name: physics_questions_zero_shot include: ../multiple_choice_template_yaml task: bigbench_physics_questions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml index a81f33b0..85aac7f4 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: play_dialog_same_or_different +dataset_name: play_dialog_same_or_different_zero_shot 
include: ../multiple_choice_template_yaml task: bigbench_play_dialog_same_or_different_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml index af82fce2..d61345fe 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/polish_sequence_labeling.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: polish_sequence_labeling +dataset_name: polish_sequence_labeling_zero_shot include: ../multiple_choice_template_yaml task: bigbench_polish_sequence_labeling_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml index 83b733a3..71a56aa8 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: presuppositions_as_nli +dataset_name: presuppositions_as_nli_zero_shot include: ../multiple_choice_template_yaml task: bigbench_presuppositions_as_nli_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml b/lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml index 5f52b44c..263d61eb 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/qa_wikidata.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: qa_wikidata +dataset_name: qa_wikidata_zero_shot include: ../multiple_choice_template_yaml task: bigbench_qa_wikidata_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml b/lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml index 1b4301bb..3b3dd0d7 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: question_selection +dataset_name: question_selection_zero_shot include: ../multiple_choice_template_yaml task: bigbench_question_selection_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml index d41cd5dd..8138791f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: real_or_fake_text +dataset_name: real_or_fake_text_zero_shot include: ../multiple_choice_template_yaml task: bigbench_real_or_fake_text_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml index e5e6f520..3ab6d5e0 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: reasoning_about_colored_objects +dataset_name: reasoning_about_colored_objects_zero_shot include: ../multiple_choice_template_yaml task: bigbench_reasoning_about_colored_objects_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml index 73406e70..666aa49b 100644 --- 
a/lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/repeat_copy_logic.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: repeat_copy_logic +dataset_name: repeat_copy_logic_zero_shot include: ../multiple_choice_template_yaml task: bigbench_repeat_copy_logic_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml b/lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml index b785712c..49e3cb4b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/rephrase.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: rephrase +dataset_name: rephrase_zero_shot include: ../multiple_choice_template_yaml task: bigbench_rephrase_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml b/lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml index e8aff5b3..93434e2c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: riddle_sense +dataset_name: riddle_sense_zero_shot include: ../multiple_choice_template_yaml task: bigbench_riddle_sense_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml b/lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml index 7504f388..32c38ba3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: ruin_names +dataset_name: ruin_names_zero_shot include: ../multiple_choice_template_yaml task: bigbench_ruin_names_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml index a462eb15..d930e741 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: salient_translation_error_detection +dataset_name: salient_translation_error_detection_zero_shot include: ../multiple_choice_template_yaml task: bigbench_salient_translation_error_detection_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml b/lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml index 5ea881cd..f23190e7 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/scientific_press_release.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: scientific_press_release +dataset_name: scientific_press_release_zero_shot include: ../multiple_choice_template_yaml task: bigbench_scientific_press_release_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml index 886b61be..00574b2f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_in_context_sparc.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: semantic_parsing_in_context_sparc +dataset_name: semantic_parsing_in_context_sparc_zero_shot include: ../multiple_choice_template_yaml task: 
bigbench_semantic_parsing_in_context_sparc_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml index cb5dc922..a988e54c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/semantic_parsing_spider.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: semantic_parsing_spider +dataset_name: semantic_parsing_spider_zero_shot include: ../multiple_choice_template_yaml task: bigbench_semantic_parsing_spider_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml b/lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml index 573f6199..4e4a18f1 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: sentence_ambiguity +dataset_name: sentence_ambiguity_zero_shot include: ../multiple_choice_template_yaml task: bigbench_sentence_ambiguity_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml b/lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml index 1e0c4ffb..82b86d1b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: similarities_abstraction +dataset_name: similarities_abstraction_zero_shot include: ../multiple_choice_template_yaml task: bigbench_similarities_abstraction_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml index 2e453821..7b1849d5 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/simp_turing_concept.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simp_turing_concept +dataset_name: simp_turing_concept_zero_shot include: ../multiple_choice_template_yaml task: bigbench_simp_turing_concept_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml index e5e24f58..cd1b61b9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_json +dataset_name: simple_arithmetic_json_zero_shot include: ../multiple_choice_template_yaml task: bigbench_simple_arithmetic_json_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml index 4fb67ac5..4e63fce9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_multiple_choice.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_json_multiple_choice +dataset_name: simple_arithmetic_json_multiple_choice_zero_shot include: ../multiple_choice_template_yaml task: bigbench_simple_arithmetic_json_multiple_choice_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml 
b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml index 67853d68..8688512b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_json_subtasks.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_json_subtasks +dataset_name: simple_arithmetic_json_subtasks_zero_shot include: ../multiple_choice_template_yaml task: bigbench_simple_arithmetic_json_subtasks_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml index b76bfbde..685ec17c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_arithmetic_multiple_targets_json.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_arithmetic_multiple_targets_json +dataset_name: simple_arithmetic_multiple_targets_json_zero_shot include: ../multiple_choice_template_yaml task: bigbench_simple_arithmetic_multiple_targets_json_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml index a8a10ca6..0983381b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_ethical_questions +dataset_name: simple_ethical_questions_zero_shot include: ../multiple_choice_template_yaml task: bigbench_simple_ethical_questions_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml b/lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml index 3bbecfb9..13b67888 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/simple_text_editing.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: simple_text_editing +dataset_name: simple_text_editing_zero_shot include: ../multiple_choice_template_yaml task: bigbench_simple_text_editing_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/snarks.yaml b/lm_eval/tasks/bigbench/multiple_choice/snarks.yaml index 4e0b9d3a..3e79f1ce 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/snarks.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/snarks.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: snarks +dataset_name: snarks_zero_shot include: ../multiple_choice_template_yaml task: bigbench_snarks_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml b/lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml index de12bcbd..a4da50c9 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: social_iqa +dataset_name: social_iqa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_social_iqa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/social_support.yaml b/lm_eval/tasks/bigbench/multiple_choice/social_support.yaml index f2e8c795..1b3bd593 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/social_support.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/social_support.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: social_support 
+dataset_name: social_support_zero_shot include: ../multiple_choice_template_yaml task: bigbench_social_support_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml b/lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml index 4a3914a4..e5a123fc 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: sports_understanding +dataset_name: sports_understanding_zero_shot include: ../multiple_choice_template_yaml task: bigbench_sports_understanding_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml b/lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml index f0882aa2..30877750 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: strange_stories +dataset_name: strange_stories_zero_shot include: ../multiple_choice_template_yaml task: bigbench_strange_stories_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml b/lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml index e99618c0..f988071b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: strategyqa +dataset_name: strategyqa_zero_shot include: ../multiple_choice_template_yaml task: bigbench_strategyqa_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml b/lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml index 56af1ae2..f53d677c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/sufficient_information.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: sufficient_information +dataset_name: sufficient_information_zero_shot include: ../multiple_choice_template_yaml task: bigbench_sufficient_information_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml b/lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml index 5c6f0cd2..ecf7465f 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: suicide_risk +dataset_name: suicide_risk_zero_shot include: ../multiple_choice_template_yaml task: bigbench_suicide_risk_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml index 497980ae..40103274 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: swahili_english_proverbs +dataset_name: swahili_english_proverbs_zero_shot include: ../multiple_choice_template_yaml task: bigbench_swahili_english_proverbs_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml index 46d49ddc..d2f31d3c 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml +++ 
b/lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: swedish_to_german_proverbs +dataset_name: swedish_to_german_proverbs_zero_shot include: ../multiple_choice_template_yaml task: bigbench_swedish_to_german_proverbs_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml index a6032ad9..98e3d5b3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: symbol_interpretation +dataset_name: symbol_interpretation_zero_shot include: ../multiple_choice_template_yaml task: bigbench_symbol_interpretation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml b/lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml index 4a63b2ac..abd8834b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: temporal_sequences +dataset_name: temporal_sequences_zero_shot include: ../multiple_choice_template_yaml task: bigbench_temporal_sequences_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/tense.yaml b/lm_eval/tasks/bigbench/multiple_choice/tense.yaml index 4fce296d..6a2676f0 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/tense.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/tense.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: tense +dataset_name: tense_zero_shot include: ../multiple_choice_template_yaml task: bigbench_tense_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/timedial.yaml b/lm_eval/tasks/bigbench/multiple_choice/timedial.yaml index 550d1190..350d4e78 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/timedial.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/timedial.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: timedial +dataset_name: timedial_zero_shot include: ../multiple_choice_template_yaml task: bigbench_timedial_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml b/lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml index 232dc706..b9a03639 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/topical_chat.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: topical_chat +dataset_name: topical_chat_zero_shot include: ../multiple_choice_template_yaml task: bigbench_topical_chat_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml index 8dd68282..f9aa366b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: tracking_shuffled_objects +dataset_name: tracking_shuffled_objects_zero_shot include: ../multiple_choice_template_yaml task: bigbench_tracking_shuffled_objects_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml b/lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml index d85d63b1..263793af 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml +++ 
b/lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: understanding_fables +dataset_name: understanding_fables_zero_shot include: ../multiple_choice_template_yaml task: bigbench_understanding_fables_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml b/lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml index 0e92a41f..f7e1feb0 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: undo_permutation +dataset_name: undo_permutation_zero_shot include: ../multiple_choice_template_yaml task: bigbench_undo_permutation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml b/lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml index b4d421e2..21a67c43 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: unit_conversion +dataset_name: unit_conversion_zero_shot include: ../multiple_choice_template_yaml task: bigbench_unit_conversion_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml b/lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml index eb60bc42..68614cfd 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: unit_interpretation +dataset_name: unit_interpretation_zero_shot include: ../multiple_choice_template_yaml task: bigbench_unit_interpretation_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml index 47c5b755..45943005 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/unnatural_in_context_learning.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: unnatural_in_context_learning +dataset_name: unnatural_in_context_learning_zero_shot include: ../multiple_choice_template_yaml task: bigbench_unnatural_in_context_learning_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml index 3ddb5e69..84305bf3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: vitaminc_fact_verification +dataset_name: vitaminc_fact_verification_zero_shot include: ../multiple_choice_template_yaml task: bigbench_vitaminc_fact_verification_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml index dda9a695..7879d166 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: what_is_the_tao +dataset_name: what_is_the_tao_zero_shot include: ../multiple_choice_template_yaml task: bigbench_what_is_the_tao_multiple_choice diff --git 
a/lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml index a6a5bbbf..3dbfb030 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: which_wiki_edit +dataset_name: which_wiki_edit_zero_shot include: ../multiple_choice_template_yaml task: bigbench_which_wiki_edit_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml b/lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml index 0b0a858c..98bc6e4b 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: winowhy +dataset_name: winowhy_zero_shot include: ../multiple_choice_template_yaml task: bigbench_winowhy_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml b/lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml index c244f547..71e79ae3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/word_sorting.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: word_sorting +dataset_name: word_sorting_zero_shot include: ../multiple_choice_template_yaml task: bigbench_word_sorting_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml b/lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml index a993ef33..bbfeb144 100644 --- a/lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml +++ b/lm_eval/tasks/bigbench/multiple_choice/word_unscrambling.yaml @@ -1,4 +1,4 @@ # Generated by utils.py -dataset_name: word_unscrambling +dataset_name: word_unscrambling_zero_shot include: ../multiple_choice_template_yaml task: bigbench_word_unscrambling_multiple_choice -- GitLab From fa608798a8c5427a4d08aae03a7c8fa42ccba48f Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 3 Oct 2023 20:13:43 +0000 Subject: [PATCH 068/212] add bigbench push_to_hub script --- .../tasks/bigbench/push_bigbench_dataset.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 lm_eval/tasks/bigbench/push_bigbench_dataset.py diff --git a/lm_eval/tasks/bigbench/push_bigbench_dataset.py b/lm_eval/tasks/bigbench/push_bigbench_dataset.py new file mode 100644 index 00000000..93c0b6c0 --- /dev/null +++ b/lm_eval/tasks/bigbench/push_bigbench_dataset.py @@ -0,0 +1,32 @@ +""" +A utility script that pushes all Bigbench subtasks from their form in the `bigbench` HF dataset +into `{org name}/bigbench`. + +Prior to running, must log into HF Hub for the target HF hub org via `huggingface-cli login`. + +Requires the installation of +`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"` +and is included so that the bigbench dependency can be avoided. 
+""" +from tqdm import tqdm +import datasets + +import bigbench.api.util as bb_utils + + +all_task_names = bb_utils.get_all_json_task_names() + +num_shots = [0] + +for shots in num_shots: + for task_name in tqdm(all_task_names[29:]): + try: + print(f"Loading '{task_name}' with num_shots={shots}...") + task_ds = datasets.load_dataset("bigbench", name=task_name, num_shots=shots) + + print(f"Pushing '{task_name}' with num_shots={shots}...") + task_ds.push_to_hub("hails/bigbench", task_name + "_zero_shot") + + del task_ds + except Exception as e: + raise e \ No newline at end of file -- GitLab From 67c0f73a9493282c8bda3f9b8a053bc775cd2b30 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 3 Oct 2023 20:14:11 +0000 Subject: [PATCH 069/212] add bigbench push_to_hub script --- lm_eval/tasks/bigbench/push_bigbench_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/bigbench/push_bigbench_dataset.py b/lm_eval/tasks/bigbench/push_bigbench_dataset.py index 93c0b6c0..acc37465 100644 --- a/lm_eval/tasks/bigbench/push_bigbench_dataset.py +++ b/lm_eval/tasks/bigbench/push_bigbench_dataset.py @@ -19,7 +19,7 @@ all_task_names = bb_utils.get_all_json_task_names() num_shots = [0] for shots in num_shots: - for task_name in tqdm(all_task_names[29:]): + for task_name in tqdm(all_task_names): try: print(f"Loading '{task_name}' with num_shots={shots}...") task_ds = datasets.load_dataset("bigbench", name=task_name, num_shots=shots) -- GitLab From 7e6e3c200a48fa0c586c7ad9006324b0e05267aa Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 00:54:59 +0000 Subject: [PATCH 070/212] modify default yamls --- lm_eval/tasks/bigbench/greedy_until_template_yaml | 3 +++ lm_eval/tasks/bigbench/multiple_choice_template_yaml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/greedy_until_template_yaml index db975306..3de59b2b 100644 --- a/lm_eval/tasks/bigbench/greedy_until_template_yaml +++ b/lm_eval/tasks/bigbench/greedy_until_template_yaml @@ -1,6 +1,9 @@ group: bigbench dataset_path: hails/bigbench output_type: greedy_until +dataset_kwargs: + # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods + # subtask_name: null test_split: default doc_to_text: inputs doc_to_target: "{{targets[0]}}" diff --git a/lm_eval/tasks/bigbench/multiple_choice_template_yaml b/lm_eval/tasks/bigbench/multiple_choice_template_yaml index 6211f5b3..3a8185e8 100644 --- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml +++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml @@ -1,7 +1,7 @@ group: bigbench dataset_path: hails/bigbench dataset_kwargs: - num_shots: 0 # TODO: num of shots should be controlled through this, not through the typical methods + # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods # subtask_name: null output_type: multiple_choice test_split: default -- GitLab From 6f92c20da4b980230191bc0aea87dee07b640673 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 4 Oct 2023 06:46:45 +0000 Subject: [PATCH 071/212] moved main to lm_eval/ to fix import of main error --- main.py => lm_eval/main.py | 0 pyproject.toml | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename main.py => lm_eval/main.py (100%) diff --git a/main.py b/lm_eval/main.py similarity index 100% rename from main.py rename to lm_eval/main.py diff 
--git a/pyproject.toml b/pyproject.toml index 8fa30cdc..525285d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,8 +47,8 @@ lm_eval = ["**/*.yaml", "tasks/**/*"] examples = ["**/*.yaml"] [project.scripts] -lm-eval = "main:main" -lm_eval = "main:main" +lm-eval = "lm_eval.main:main" +lm_eval = "lm_eval.main:main" [project.urls] Homepage = "https://github.com/EleutherAI/lm-evaluation-harness" -- GitLab From 3b5e554ffdf51506820e73972ba4373d3348516c Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 14:09:48 +0000 Subject: [PATCH 072/212] move to __main__.py --- lm_eval/{main.py => __main__.py} | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) rename lm_eval/{main.py => __main__.py} (95%) diff --git a/lm_eval/main.py b/lm_eval/__main__.py similarity index 95% rename from lm_eval/main.py rename to lm_eval/__main__.py index 96207884..edfb5d2c 100644 --- a/lm_eval/main.py +++ b/lm_eval/__main__.py @@ -12,10 +12,9 @@ from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger, SPACING from lm_eval.tasks import include_task_folder -os.environ["TOKENIZERS_PARALLELISM"] = "false" +from typing import Union - -def parse_args() -> argparse.Namespace: +def parse_eval_args() -> argparse.Namespace: parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--model", required=True, help="Name of model e.g. `hf`") parser.add_argument( @@ -100,8 +99,13 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def main() -> None: - args = parse_args() +def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: + + if not args: + # we allow for args to be passed externally, else we parse them ourselves + args = parse_eval_args() + + os.environ["TOKENIZERS_PARALLELISM"] = "false" if args.limit: eval_logger.warning( @@ -212,5 +216,5 @@ def main() -> None: print(evaluator.make_table(results, "groups")) -if __name__ == "__main__": - main() +if __name__ == "__main__": + cli_evaluate() -- GitLab From 2e13caa64ce5a5195ceb81d7bb834df36f852807 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 14:11:06 +0000 Subject: [PATCH 073/212] update commandline script --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 525285d0..df668fb3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,8 +47,8 @@ lm_eval = ["**/*.yaml", "tasks/**/*"] examples = ["**/*.yaml"] [project.scripts] -lm-eval = "lm_eval.main:main" -lm_eval = "lm_eval.main:main" +lm-eval = "lm_eval.__main__:cli_evaluate" +lm_eval = "lm_eval.__main__:cli_evaluate" [project.urls] Homepage = "https://github.com/EleutherAI/lm-evaluation-harness" -- GitLab From 0856828fc3d8d0638570644f424156d68be119e1 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 14:16:35 +0000 Subject: [PATCH 074/212] remove references to main.py --- docs/decontamination.md | 4 ++-- docs/interface.md | 2 +- docs/model_guide.md | 4 ++-- docs/new_task_guide.md | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/decontamination.md b/docs/decontamination.md index 609303dd..f37f1fc1 100644 --- a/docs/decontamination.md +++ b/docs/decontamination.md @@ -2,11 +2,11 @@ ## Usage -Simply add a "--decontamination_ngrams_path" when running main.py. The provided directory should contain +Simply add a "--decontamination_ngrams_path" when running \__main\__.py. 
The provided directory should contain the ngram files and info.json produced in "Pile Ngram Generation" further down. ```bash -python main.py \ +python -m lm_eval \ --model gpt2 \ --device 0 \ --tasks sciq \ diff --git a/docs/interface.md b/docs/interface.md index 029c635d..860dd1c0 100644 --- a/docs/interface.md +++ b/docs/interface.md @@ -4,7 +4,7 @@ This document details the interface exposed by `lm-eval` and provides details on ## Command-line Interface -A majority of users run the library by cloning it from Github and running the `main.py` script. +A majority of users run the library by cloning it from Github, installing the package as editable, and running the `python -m lm_eval` script. Equivalently, running the library can be done via the `lm-eval` entrypoint at the command line. diff --git a/docs/model_guide.md b/docs/model_guide.md index 8ffbf609..cf79dd77 100644 --- a/docs/model_guide.md +++ b/docs/model_guide.md @@ -70,9 +70,9 @@ smth smth tokenizer-agnostic Congrats on implementing your model! Now it's time to test it out. -To make your model usable via the command line interface to `lm-eval` using `main.py`, you'll need to tell `lm-eval` what your model's name is. +To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is. -This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python main.py --model ` and alert `lm-eval` to the model's existence. +This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lm_eval --model ` and alert `lm-eval` to the model's existence. ```python from lm_eval.api.registry import register_model diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 54745f47..cdbc8655 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -258,7 +258,7 @@ You can do this via adding the Python snippet from lm_eval.tasks import include_task_folder include_task_folder("/path/to/yaml/parent/folder") ``` -to the top of any Python file that is run or imported when performing evaluation, such as `main.py`. +to the top of any Python file that is run or imported when performing evaluation, such as `\_\_main\_\_.py`. Passing `--tasks /path/to/yaml/file` is also accepted. -- GitLab From 9b06de5498c37e76b82a75f9a775530b382848f9 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 14:16:41 +0000 Subject: [PATCH 075/212] remove references to main.py --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index cfd0c75d..6e22436c 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,8 @@ We’d like your help to test it out! you can help by: 2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information. 
If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with: -- A command of the form `python main.py --model hf --model_args ..... --tasks ...` which will run the task in the `master` branch, and what the score is -- A command of the form `python main.py --model hf --model_args ..... --tasks ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations. +- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks ...` which will run the task in the `master` branch, and what the score is +- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations. Lastly, we'll no longer be accepting new feature requests beyond those that are already open to the master branch as we carry out this switch to the new version over the next week, though we will be accepting bugfixes to `master` branch and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EAI discord for more information. @@ -67,7 +67,7 @@ To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/model ```bash -python main.py \ +python -m lm_eval \ --model hf \ --model_args pretrained=EleutherAI/gpt-j-6B \ --tasks hellaswag \ @@ -78,7 +78,7 @@ python main.py \ Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model: ```bash -python main.py \ +python -m lm_eval \ --model hf \ --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \ --tasks lambada_openai,hellaswag \ @@ -91,7 +91,7 @@ Models that are loaded via either `transformers.AutoModelForCausalLM` (autoregre Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be: ```bash -python main.py \ +python -m lm_eval \ --model hf \ --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \ --tasks lambada_openai,hellaswag \ @@ -99,7 +99,7 @@ python main.py \ --batch_size auto:4 ``` -Alternatively, you can use `lm-eval` instead of `python main.py` to call lm eval from anywhere. +Alternatively, you can use `lm-eval` or `lm_eval` instead of `python -m lm_eval` to call lm eval from anywhere. 
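For instance, the auto batch-size run above could equivalently be launched through the console script (a sketch — the flags simply mirror the `python -m lm_eval` form shown above):

```bash
lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks lambada_openai,hellaswag \
    --device cuda:0 \
    --batch_size auto:4
```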
### Multi-GPU Evaluation with Hugging Face `accelerate` @@ -108,7 +108,7 @@ To parallelize evaluation of HuggingFace models across multiple GPUs, we allow f The first is performed by launching evaluation via the `accelerate` library as follows: ``` -accelerate launch main.py \ +accelerate launch -m lm_eval \ --model hf \ --tasks lambada_openai,arc_easy \ --batch_size 16 \ @@ -121,7 +121,7 @@ If your model *is too large to be run on a single one of your GPUs* then you We also provide a second method to run these large models: use of the `parallelize` argument. ``` -python main.py \ +python -m lm_eval \ --model hf \ --model_args pretrained=EleutherAI/pythia-12b,parallelize=True --tasks lambada_openai,arc_easy \ @@ -136,7 +136,7 @@ To pass even more advanced keyword arguments to `accelerate`, we allow for the f Note that this method naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs. -**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.** +**Note that this option requires launching evaluation via `python -m lm_eval` rather than `accelerate launch -m lm_eval`.** To use `accelerate` with the `lm-eval` command, use ``` @@ -167,7 +167,7 @@ Our library supports language models served via the OpenAI Completions API as fo ```bash export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE -python main.py \ +python -m lm_eval \ --model openai-completions \ --model_args engine=davinci \ --tasks lambada_openai,hellaswag @@ -198,7 +198,7 @@ This will write out one text file for each task. To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag: ```bash -python main.py \ +python -m lm_eval \ --model openai \ --model_args engine=davinci \ --tasks lambada_openai,hellaswag \ @@ -209,7 +209,7 @@ For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library.
For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument: ```bash -python main.py \ +python -m lm_eval \ --model hf \ --model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \ --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \ @@ -219,7 +219,7 @@ python main.py \ [GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,gptq=NAME` (or `,gptq=True` for default names) in the `model_args` argument: ```bash -python main.py \ +python -m lm_eval \ --model hf \ --model_args pretrained=model-name-or-path,gptq=model.safetensors,gptq_use_triton=True \ --tasks hellaswag -- GitLab From 436b2697f70cef7b7cd836a2359b13918f2b6531 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 14:17:01 +0000 Subject: [PATCH 076/212] also, reenable CPU tests --- .github/workflows/new_tasks.yml | 136 +++++++++++++++---------------- .github/workflows/unit_tests.yml | 64 +++++++-------- 2 files changed, 100 insertions(+), 100 deletions(-) diff --git a/.github/workflows/new_tasks.yml b/.github/workflows/new_tasks.yml index fb406988..ebb28a30 100644 --- a/.github/workflows/new_tasks.yml +++ b/.github/workflows/new_tasks.yml @@ -1,72 +1,72 @@ -# name: Tasks Modified +name: Tasks Modified -# on: -# push: -# branches: -# - 'big-refactor*' -# pull_request: -# branches: -# - 'big-refactor*' -# workflow_dispatch: -# # comment/edit out the above to stop/change the triggers -# jobs: -# changed_files: -# runs-on: ubuntu-latest # windows-latest || macos-latest -# timeout-minutes: 120 -# name: Scan for changed tasks -# steps: -# - name: checkout -# uses: actions/checkout@v3 -# with: -# fetch-depth: 2 # OR "2" -> To retrieve the preceding commit. +on: + push: + branches: + - 'big-refactor*' + pull_request: + branches: + - 'big-refactor*' + workflow_dispatch: +# comment/edit out the above to stop/change the triggers +jobs: + changed_files: + runs-on: ubuntu-latest # windows-latest || macos-latest + timeout-minutes: 120 + name: Scan for changed tasks + steps: + - name: checkout + uses: actions/checkout@v3 + with: + fetch-depth: 2 # OR "2" -> To retrieve the preceding commit. -# # Uses the tj-actions/changed-files@v37 action to check for changes. -# # Outputs provided here: https://github.com/tj-actions/changed-files#outputs -# # The `files_yaml` input optionally takes a yaml string to specify filters, -# # and prepends the filter name to the standard output names. -# - name: Check task folders -# id: changed-tasks -# uses: tj-actions/changed-files@v37.1.2 -# with: -# # tasks checks the tasks folder and api checks the api folder for changes -# files_yaml: | -# tasks: -# - lm_eval/tasks/** -# api: -# - lm_eval/api/** -# write_output_files: true + # Uses the tj-actions/changed-files@v37 action to check for changes. + # Outputs provided here: https://github.com/tj-actions/changed-files#outputs + # The `files_yaml` input optionally takes a yaml string to specify filters, + # and prepends the filter name to the standard output names. 
+ - name: Check task folders + id: changed-tasks + uses: tj-actions/changed-files@v37.1.2 + with: + # tasks checks the tasks folder and api checks the api folder for changes + files_yaml: | + tasks: + - lm_eval/tasks/** + api: + - lm_eval/api/** + write_output_files: true -# # The next step is optional; the files are written to the workspace by default (above). -# # so it's just for debugging -# - name: Run Tests -# if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' -# run: | -# echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV' -# echo "One or more test file(s) has changed." -# echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" + # The next step is optional; the files are written to the workspace by default (above). + # so it's just for debugging + - name: Run Tests + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + run: | + echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV' + echo "One or more test file(s) has changed." + echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" -# - name: Set up Python 3.9 -# if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' -# uses: actions/setup-python@v4 -# with: -# python-version: 3.9 -# cache: 'pip' -# cache-dependency-path: setup.py -# - name: Install dependencies -# if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' -# run: | -# python -m pip install --upgrade pip -# pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu -# # Install optional git dependencies -# # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt -# # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi -# - name: Test with pytest -# # if new tasks are added, run tests on them -# if: steps.changed-tasks.outputs.tasks_any_modified == 'true' -# run: python -m pytest tests/test_tasks.py -s -vv -# # if api is modified, run tests on it -# - name: Test more tasks with pytest -# env: -# API: true -# if: steps.changed-tasks.outputs.api_any_modified == 'true' -# run: python -m pytest tests/test_tasks.py -s -vv + - name: Set up Python 3.9 + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + uses: actions/setup-python@v4 + with: + python-version: 3.9 + cache: 'pip' + cache-dependency-path: setup.py + - name: Install dependencies + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + run: | + python -m pip install --upgrade pip + pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu + # Install optional git dependencies + # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt + # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + # if new tasks are added, run tests on them + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' + run: python -m pytest tests/test_tasks.py -s -vv + # if api is modified, run tests on it + - name: Test more tasks with pytest + 
env: + API: true + if: steps.changed-tasks.outputs.api_any_modified == 'true' + run: python -m pytest tests/test_tasks.py -s -vv diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index c56347dd..4f105d09 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -43,35 +43,35 @@ jobs: # # mypy turned off for now # - name: Lint with mypy # run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable -# Job 2 -# testcpu: -# name: CPU Tests -# runs-on: ubuntu-latest -# strategy: -# matrix: -# python-version: [ "3.8", "3.9", "3.10", "3.11" ] -# timeout-minutes: 30 -# steps: -# - name: Checkout Code -# uses: actions/checkout@v3 -# - name: Set up Python ${{ matrix.python-version }} -# uses: actions/setup-python@v4 -# with: -# python-version: ${{ matrix.python-version }} -# cache: pip -# cache-dependency-path: setup.py -# - name: Install dependencies -# run: | -# python -m pip install --upgrade pip -# pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu -# # Install optional git dependencies -# # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt -# # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi -# - name: Test with pytest -# run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra -# - name: Archive artifacts -# uses: actions/upload-artifact@v3 -# with: -# name: output_results -# path: | -# test_logs/* +Job 2 + testcpu: + name: CPU Tests + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11" ] + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: setup.py + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu +# Install optional git dependencies +# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt +# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* -- GitLab From 54498fcc65167f2afb71a348a6deaf7bfd927bca Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 14:35:31 +0000 Subject: [PATCH 077/212] change back to original dataset path --- lm_eval/tasks/bigbench/greedy_until_template_yaml | 2 +- lm_eval/tasks/bigbench/multiple_choice_template_yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/greedy_until_template_yaml index 3de59b2b..130500cc 100644 --- a/lm_eval/tasks/bigbench/greedy_until_template_yaml +++ b/lm_eval/tasks/bigbench/greedy_until_template_yaml @@ -1,5 +1,5 @@ group: bigbench -dataset_path: hails/bigbench +dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed output_type: greedy_until dataset_kwargs: # num_shots: 0 # TODO: num of shots for 
`bigbench` HF dataset should be controlled through this, not through the typical methods diff --git a/lm_eval/tasks/bigbench/multiple_choice_template_yaml b/lm_eval/tasks/bigbench/multiple_choice_template_yaml index 3a8185e8..3de7b5b7 100644 --- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml +++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml @@ -1,5 +1,5 @@ group: bigbench -dataset_path: hails/bigbench +dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed dataset_kwargs: # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods # subtask_name: null -- GitLab From 51a43a768ea527e9313fb28005c59bdd936e862c Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 14:46:10 +0000 Subject: [PATCH 078/212] change back to original dataset path --- lm_eval/tasks/bigbench/push_bigbench_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lm_eval/tasks/bigbench/push_bigbench_dataset.py b/lm_eval/tasks/bigbench/push_bigbench_dataset.py index acc37465..7566a664 100644 --- a/lm_eval/tasks/bigbench/push_bigbench_dataset.py +++ b/lm_eval/tasks/bigbench/push_bigbench_dataset.py @@ -2,9 +2,9 @@ A utility script that pushes all Bigbench subtasks from their form in the `bigbench` HF dataset into `{org name}/bigbench`. -Prior to running, must log into HF Hub for the target HF hub org via `huggingface-cli login`. +Prior to running, log into HF Hub for the target HF hub org via `huggingface-cli login`. -Requires the installation of +Requires the installation of `pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"` and is included so that the bigbench dependency can be avoided. """ @@ -20,7 +20,7 @@ num_shots = [0] for shots in num_shots: for task_name in tqdm(all_task_names): - try: + try: print(f"Loading '{task_name}' with num_shots={shots}...") task_ds = datasets.load_dataset("bigbench", name=task_name, num_shots=shots) @@ -29,4 +29,4 @@ for shots in num_shots: del task_ds except Exception as e: - raise e \ No newline at end of file + raise e -- GitLab From 09d935eeb3e0c21c095d5ca1df5f711746196d46 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 4 Oct 2023 21:49:20 +0000 Subject: [PATCH 079/212] fix wildcards --- lm_eval/__main__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index edfb5d2c..2eb191ef 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -14,6 +14,7 @@ from lm_eval.tasks import include_task_folder from typing import Union + def parse_eval_args() -> argparse.Namespace: parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--model", required=True, help="Name of model e.g. `hf`") @@ -136,8 +137,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: if os.path.isfile(task): config = utils.load_yaml_config(task) task_names.append(config) - else: - task_missing.append(task) if task_missing != []: missing = ", ".join(task_missing) -- GitLab From 24ecd3866a80eb5e70b49556445c676018c9e538 Mon Sep 17 00:00:00 2001 From: ManuelFay Date: Thu, 5 Oct 2023 16:37:22 +0200 Subject: [PATCH 080/212] change . to :
in the prompt --- lm_eval/tasks/belebele/_default_template_yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lm_eval/tasks/belebele/_default_template_yaml b/lm_eval/tasks/belebele/_default_template_yaml index 4dbea664..06fe8115 100644 --- a/lm_eval/tasks/belebele/_default_template_yaml +++ b/lm_eval/tasks/belebele/_default_template_yaml @@ -1,6 +1,5 @@ group: belebele dataset_path: facebook/belebele -description: "Choose the best answer to the question.\n" test_split: test fewshot_split: test fewshot_config: @@ -8,7 +7,7 @@ fewshot_config: output_type: multiple_choice should_decontaminate: true doc_to_decontamination_query: "{{question}}" -doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA. {{mc_answer1}}\nB. {{mc_answer2}}\nC. {{mc_answer3}}\nD. {{mc_answer4}}\nAnswer:" +doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nAnswer:" doc_to_choice: ["A", "B", "C", "D"] doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" metric_list: -- GitLab From 20a54b3af209ea191051cdb8a8abb3b6b1996048 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 5 Oct 2023 15:49:48 +0000 Subject: [PATCH 081/212] removed print messages, added cot extraction strings --- lm_eval/tasks/__init__.py | 2 -- .../bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml | 2 +- .../mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 64dd4fdb..c139d849 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -141,8 +141,6 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: except Exception as error: import traceback - print("###") - print(yaml_path) eval_logger.warning( "Failed to load config in\n" f" {yaml_path}\n" diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml index b6574a4e..7ccf3699 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml @@ -18,5 +18,5 @@ filter_list: - name: "get-answer" filter: - function: "regex" - regex_pattern: "(?<=the answer is )(.*)(?=.)" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" - function: "take_first" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml index c9b03734..0666018b 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml @@ -9,7 +9,7 @@ filter_list: - name: "get-answer" filter: - function: "regex" - regex_pattern: "(?<=The answer is )(.*)(?=.)" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" - function: "take_first" generation_kwargs: until: -- GitLab From f2c396ab044d3d5f5087757051ecb0534eebea6a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 5 Oct 2023 15:50:19 +0000 Subject: [PATCH 082/212] pre-commit reformat --- lm_eval/tasks/bigbench/README.md | 2 +- lm_eval/tasks/bigbench/generate_tasks.py | 351 ++++++++++++----------- 2 files changed, 180 
insertions(+), 173 deletions(-) diff --git a/lm_eval/tasks/bigbench/README.md b/lm_eval/tasks/bigbench/README.md index bfb7d457..be680eac 100644 --- a/lm_eval/tasks/bigbench/README.md +++ b/lm_eval/tasks/bigbench/README.md @@ -6,7 +6,7 @@ Title: `Beyond the Imitation Game: Quantifying and extrapolating the capabilitie Abstract: https://arxiv.org/abs/2206.04615 -The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities. +The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities. Homepage: https://github.com/google/BIG-bench diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py index dbd7a959..00a8799e 100644 --- a/lm_eval/tasks/bigbench/generate_tasks.py +++ b/lm_eval/tasks/bigbench/generate_tasks.py @@ -2,179 +2,182 @@ import os import yaml all_subtasks = [ - 'abstract_narrative_understanding', - 'anachronisms', - 'analogical_similarity', - 'analytic_entailment', - 'arithmetic', - 'ascii_word_recognition', - 'authorship_verification', - 'auto_categorization', - 'auto_debugging', - 'bbq_lite_json', - 'bridging_anaphora_resolution_barqa', - 'causal_judgment', - 'cause_and_effect', - 'checkmate_in_one', - 'chess_state_tracking', - 'chinese_remainder_theorem', - 'cifar10_classification', - 'code_line_description', - 'codenames', - 'color', - 'common_morpheme', - 'conceptual_combinations', - 'conlang_translation', - 'contextual_parametric_knowledge_conflicts', - 'crash_blossom', - 'crass_ai', - 'cryobiology_spanish', - 'cryptonite', - 'cs_algorithms', - 'dark_humor_detection', - 'date_understanding', - 'disambiguation_qa', - 'discourse_marker_prediction', - 'disfl_qa', - 'dyck_languages', - 'elementary_math_qa', - 'emoji_movie', - 'emojis_emotion_prediction', - 'empirical_judgments', - 'english_proverbs', - 'english_russian_proverbs', - 'entailed_polarity', - 'entailed_polarity_hindi', - 'epistemic_reasoning', - 'evaluating_information_essentiality', - 'fact_checker', - 'fantasy_reasoning', - 'few_shot_nlg', - 'figure_of_speech_detection', - 'formal_fallacies_syllogisms_negation', - 'gem', - 'gender_inclusive_sentences_german', - 'general_knowledge', - 'geometric_shapes', - 'goal_step_wikihow', - 'gre_reading_comprehension', - 'hhh_alignment', - 'hindi_question_answering', - 'hindu_knowledge', - 'hinglish_toxicity', - 'human_organs_senses', - 'hyperbaton', - 'identify_math_theorems', - 'identify_odd_metaphor', - 'implicatures', - 'implicit_relations', - 'intent_recognition', - 'international_phonetic_alphabet_nli', - 'international_phonetic_alphabet_transliterate', - 'intersect_geometry', - 'irony_identification', - 'kanji_ascii', - 'kannada', - 'key_value_maps', - 'known_unknowns', - 'language_games', - 'language_identification', - 'linguistic_mappings', - 'linguistics_puzzles', - 'list_functions', - 'logic_grid_puzzle', - 'logical_args', - 'logical_deduction', - 'logical_fallacy_detection', - 'logical_sequence', - 'mathematical_induction', - 'matrixshapes', - 'metaphor_boolean', - 'metaphor_understanding', - 'minute_mysteries_qa', - 'misconceptions', - 'misconceptions_russian', - 'mnist_ascii', - 'modified_arithmetic', - 'moral_permissibility', - 'movie_dialog_same_or_different', - 'movie_recommendation', - 'mult_data_wrangling', - 'multiemo', - 'natural_instructions', - 'navigate', - 'nonsense_words_grammar', - 'novel_concepts', - 
'object_counting', - 'odd_one_out', - 'operators', - 'paragraph_segmentation', - 'parsinlu_qa', - 'parsinlu_reading_comprehension', - 'penguins_in_a_table', - 'periodic_elements', - 'persian_idioms', - 'phrase_relatedness', - 'physical_intuition', - 'physics', - 'physics_questions', - 'play_dialog_same_or_different', - 'polish_sequence_labeling', - 'presuppositions_as_nli', - 'qa_wikidata', - 'question_selection', - 'real_or_fake_text', - 'reasoning_about_colored_objects', - 'repeat_copy_logic', - 'rephrase', - 'riddle_sense', - 'ruin_names', - 'salient_translation_error_detection', - 'scientific_press_release', - 'semantic_parsing_in_context_sparc', - 'semantic_parsing_spider', - 'sentence_ambiguity', - 'similarities_abstraction', - 'simp_turing_concept', - 'simple_arithmetic_json', - 'simple_arithmetic_json_multiple_choice', - 'simple_arithmetic_json_subtasks', - 'simple_arithmetic_multiple_targets_json', - 'simple_ethical_questions', - 'simple_text_editing', - 'snarks', - 'social_iqa', - 'social_support', - 'sports_understanding', - 'strange_stories', - 'strategyqa', - 'sufficient_information', - 'suicide_risk', - 'swahili_english_proverbs', - 'swedish_to_german_proverbs', - 'symbol_interpretation', - 'temporal_sequences', - 'tense', - 'timedial', - 'topical_chat', - 'tracking_shuffled_objects', - 'understanding_fables', - 'undo_permutation', - 'unit_conversion', - 'unit_interpretation', - 'unnatural_in_context_learning', - 'vitaminc_fact_verification', - 'what_is_the_tao', - 'which_wiki_edit', - 'winowhy', - 'word_sorting', - 'word_unscrambling' - ] + "abstract_narrative_understanding", + "anachronisms", + "analogical_similarity", + "analytic_entailment", + "arithmetic", + "ascii_word_recognition", + "authorship_verification", + "auto_categorization", + "auto_debugging", + "bbq_lite_json", + "bridging_anaphora_resolution_barqa", + "causal_judgment", + "cause_and_effect", + "checkmate_in_one", + "chess_state_tracking", + "chinese_remainder_theorem", + "cifar10_classification", + "code_line_description", + "codenames", + "color", + "common_morpheme", + "conceptual_combinations", + "conlang_translation", + "contextual_parametric_knowledge_conflicts", + "crash_blossom", + "crass_ai", + "cryobiology_spanish", + "cryptonite", + "cs_algorithms", + "dark_humor_detection", + "date_understanding", + "disambiguation_qa", + "discourse_marker_prediction", + "disfl_qa", + "dyck_languages", + "elementary_math_qa", + "emoji_movie", + "emojis_emotion_prediction", + "empirical_judgments", + "english_proverbs", + "english_russian_proverbs", + "entailed_polarity", + "entailed_polarity_hindi", + "epistemic_reasoning", + "evaluating_information_essentiality", + "fact_checker", + "fantasy_reasoning", + "few_shot_nlg", + "figure_of_speech_detection", + "formal_fallacies_syllogisms_negation", + "gem", + "gender_inclusive_sentences_german", + "general_knowledge", + "geometric_shapes", + "goal_step_wikihow", + "gre_reading_comprehension", + "hhh_alignment", + "hindi_question_answering", + "hindu_knowledge", + "hinglish_toxicity", + "human_organs_senses", + "hyperbaton", + "identify_math_theorems", + "identify_odd_metaphor", + "implicatures", + "implicit_relations", + "intent_recognition", + "international_phonetic_alphabet_nli", + "international_phonetic_alphabet_transliterate", + "intersect_geometry", + "irony_identification", + "kanji_ascii", + "kannada", + "key_value_maps", + "known_unknowns", + "language_games", + "language_identification", + "linguistic_mappings", + "linguistics_puzzles", + 
"list_functions", + "logic_grid_puzzle", + "logical_args", + "logical_deduction", + "logical_fallacy_detection", + "logical_sequence", + "mathematical_induction", + "matrixshapes", + "metaphor_boolean", + "metaphor_understanding", + "minute_mysteries_qa", + "misconceptions", + "misconceptions_russian", + "mnist_ascii", + "modified_arithmetic", + "moral_permissibility", + "movie_dialog_same_or_different", + "movie_recommendation", + "mult_data_wrangling", + "multiemo", + "natural_instructions", + "navigate", + "nonsense_words_grammar", + "novel_concepts", + "object_counting", + "odd_one_out", + "operators", + "paragraph_segmentation", + "parsinlu_qa", + "parsinlu_reading_comprehension", + "penguins_in_a_table", + "periodic_elements", + "persian_idioms", + "phrase_relatedness", + "physical_intuition", + "physics", + "physics_questions", + "play_dialog_same_or_different", + "polish_sequence_labeling", + "presuppositions_as_nli", + "qa_wikidata", + "question_selection", + "real_or_fake_text", + "reasoning_about_colored_objects", + "repeat_copy_logic", + "rephrase", + "riddle_sense", + "ruin_names", + "salient_translation_error_detection", + "scientific_press_release", + "semantic_parsing_in_context_sparc", + "semantic_parsing_spider", + "sentence_ambiguity", + "similarities_abstraction", + "simp_turing_concept", + "simple_arithmetic_json", + "simple_arithmetic_json_multiple_choice", + "simple_arithmetic_json_subtasks", + "simple_arithmetic_multiple_targets_json", + "simple_ethical_questions", + "simple_text_editing", + "snarks", + "social_iqa", + "social_support", + "sports_understanding", + "strange_stories", + "strategyqa", + "sufficient_information", + "suicide_risk", + "swahili_english_proverbs", + "swedish_to_german_proverbs", + "symbol_interpretation", + "temporal_sequences", + "tense", + "timedial", + "topical_chat", + "tracking_shuffled_objects", + "understanding_fables", + "undo_permutation", + "unit_conversion", + "unit_interpretation", + "unnatural_in_context_learning", + "vitaminc_fact_verification", + "what_is_the_tao", + "which_wiki_edit", + "winowhy", + "word_sorting", + "word_unscrambling", +] def main() -> None: - for path, task_type in zip(["multiple_choice", "greedy_until"], ["multiple_choice_template_yaml", "greedy_until_template_yaml"]): + for path, task_type in zip( + ["multiple_choice", "greedy_until"], + ["multiple_choice_template_yaml", "greedy_until_template_yaml"], + ): os.makedirs(path, exist_ok=True) for task in all_subtasks: file_name = f"{task}.yaml" @@ -184,11 +187,15 @@ def main() -> None: yaml.dump( { "include": f"../{task_type}", - "task": "bigbench_" + task + "_{}".format(task_type.split("_template_yaml")[0]), - "dataset_name": task + "_zero_shot", # zero-shot version of the dataset + "task": "bigbench_" + + task + + "_{}".format(task_type.split("_template_yaml")[0]), + "dataset_name": task + + "_zero_shot", # zero-shot version of the dataset }, f, - width=float("inf"), allow_unicode=True + width=float("inf"), + allow_unicode=True, ) except FileExistsError: pass -- GitLab From 2493c6ccc7605933275ed7cb34221b8d8086eb33 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 6 Oct 2023 08:01:25 +0000 Subject: [PATCH 083/212] changed default to 5-shot --- lm_eval/tasks/benchmarks/flan/flan_held_out.yaml | 9 +++++---- lm_eval/tasks/mmlu/default/_default_template_yaml | 1 + .../mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml | 1 + .../flan_n_shot/_mmlu_flan_loglikelihood_template_yaml | 1 + 4 files changed, 8 insertions(+), 4 deletions(-) diff --git 
a/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml b/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml index 150e9477..e1429cda 100644 --- a/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml +++ b/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml @@ -3,10 +3,11 @@ task: # BBH - bbh_flan_zeroshot - bbh_flan_fewshot - # - bbh_flan_cot_fewshot - # - bbh_flan_cot_zeroshot + - bbh_flan_cot_fewshot + - bbh_flan_cot_zeroshot # MMLU + - mmlu - mmlu_flan_n_shot_generative - mmlu_flan_n_shot_loglikelihood - # - mmlu_flan_cot_zeroshot - # - mmlu_flan_cot_fewshot + - mmlu_flan_cot_zeroshot + - mmlu_flan_cot_fewshot diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index bd989c40..93aea1f6 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -15,3 +15,4 @@ metric_list: - metric: acc_norm aggregation: mean higher_is_better: true +num_fewshot: 5 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml index b1ff96a8..8e39ff49 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml @@ -12,3 +12,4 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true +num_fewshot: 5 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml index 5db2981a..15196e7e 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml @@ -13,3 +13,4 @@ metric_list: - metric: acc_norm aggregation: mean higher_is_better: true +num_fewshot: 5 -- GitLab From 7d5e511c2d56e04bf7c94e66ac56c9e160c5493d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 6 Oct 2023 14:16:51 +0000 Subject: [PATCH 084/212] adjustments --- lm_eval/prompts/__init__.py | 5 +++++ lm_eval/tasks/mmlu/default/_default_template_yaml | 1 - .../mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml | 1 - .../mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml | 1 - 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index 68eeac6c..746dc3da 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -116,6 +116,11 @@ class PromptString: doc_to_text = self.prompt_string["doc_to_text"] doc_to_target = self.prompt_string["doc_to_target"] + + # TODO need a way to process doc_to_choice + if "doc_to_choice" in self.prompt_string: + raise NotImplementedError("Not yet implemented to accept doc_to_choice") + text_string = utils.apply_template(doc_to_text, doc) target_string = utils.apply_template(doc_to_target, doc) diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index 93aea1f6..bd989c40 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -15,4 +15,3 @@ metric_list: - metric: acc_norm aggregation: mean higher_is_better: true -num_fewshot: 5 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml index 8e39ff49..b1ff96a8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml +++
b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml @@ -12,4 +12,3 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true -num_fewshot: 5 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml index 15196e7e..5db2981a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml @@ -13,4 +13,3 @@ metric_list: - metric: acc_norm aggregation: mean higher_is_better: true -num_fewshot: 5 -- GitLab From 09d20bfa721f0a61ef0dca0de1ea6504ecbe8f80 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 6 Oct 2023 18:17:15 +0000 Subject: [PATCH 085/212] fix YML error in workflow --- .github/workflows/unit_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 4f105d09..59d5e6d1 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -43,7 +43,7 @@ jobs: # # mypy turned off for now # - name: Lint with mypy # run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable -Job 2 +# Job 2 testcpu: name: CPU Tests runs-on: ubuntu-latest -- GitLab From e429b6d858e807abbc9eb6b14bf93544f0732bcf Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Sat, 7 Oct 2023 15:33:27 -0400 Subject: [PATCH 086/212] Added notable contributors to the citation block I have looked through the commit history and added some notable contributors to the citation block. I probably missed some deserving individuals: exclusion from this list is not a judgement that someone should not be added. 
--- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 6e22436c..62dc36e0 100644 --- a/README.md +++ b/README.md @@ -248,16 +248,23 @@ You can also ask for help, or discuss new features with the maintainers in the # @software{eval-harness, author = {Gao, Leo and Tow, Jonathan and + Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and + Le Noac'h, Alain and + Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and + Ociepa, Chris and Phang, Jason and Reynolds, Laria and + Schoelkopf, Hailey and + Skowron, Aviya and + Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and -- GitLab From 4d8094bb351682574eeb589a7f2155929e259c2a Mon Sep 17 00:00:00 2001 From: baberabb <92168766+baberabb@users.noreply.github.com> Date: Mon, 9 Oct 2023 00:34:09 +0500 Subject: [PATCH 087/212] Improve error logging --- .pre-commit-config.yaml | 1 - lm_eval/__main__.py | 21 +++++++++++---------- lm_eval/api/task.py | 12 +++++++----- lm_eval/tasks/__init__.py | 13 +++++++------ 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8fa6aeaa..1888077a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,6 @@ repos: rev: 22.3.0 hooks: - id: black - language_version: python3.8 - repo: https://github.com/codespell-project/codespell rev: v2.1.0 hooks: diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index 1996d680..b8f0942a 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -101,7 +101,6 @@ def parse_eval_args() -> argparse.Namespace: def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: - if not args: # we allow for args to be passed externally, else we parse them ourselves args = parse_eval_args() @@ -132,19 +131,21 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: else: tasks_list = args.tasks.split(",") task_names = utils.pattern_match(tasks_list, ALL_TASKS) - task_missing = [] for task in [task for task in tasks_list if task not in task_names]: if os.path.isfile(task): config = utils.load_yaml_config(task) task_names.append(config) + task_missing = [task for task in tasks_list if task not in task_names] + + if task_missing: + missing = ", ".join(task_missing) + eval_logger.error( + f"Tasks were not found: {missing}\n" + f"{SPACING}Try `lm-eval -h` for list of available tasks", + ) + raise ValueError( + f"Tasks {missing} were not found. Try `lm-eval -h` for list of available tasks." + ) if args.output_path: path = Path(args.output_path) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 5bc07b6a..d0f7d14b 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -99,7 +99,7 @@ class TaskConfig(dict): if self.generation_kwargs is not None: if self.output_type != "greedy_until": eval_logger.warning( - "passed `generation_kwargs`, but not using `output_type: greedy_until`!" + f"[{self.task}] passed `generation_kwargs`, but not using `output_type: greedy_until`!"
) assert self.output_type != "greedy_until" @@ -759,7 +759,6 @@ class ConfigurableTask(Task): return super().fewshot_docs() def apply_filters(self): - if hasattr(self, "_filters"): for f in self._filters: f.apply(self._instances, self.task_docs) @@ -967,7 +966,6 @@ class ConfigurableTask(Task): ) def process_results(self, doc, results): - if callable(self.config.process_results): return self.config.process_results(doc, results) @@ -1104,7 +1102,9 @@ class ConfigurableTask(Task): predictions=[result], **self._metric_fn_kwargs[metric], ) - except TypeError: # TODO: this is hacky and I don't want to do it + except ( + TypeError + ): # TODO: this is hacky and I don't want to do it result_score = self._metric_fn_list[metric]( [gold_option, result] ) @@ -1123,7 +1123,9 @@ class ConfigurableTask(Task): predictions=[result], **self._metric_fn_kwargs[metric], ) - except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics + except ( + TypeError + ): # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics result_score = self._metric_fn_list[metric]([gold, result]) if isinstance(result_score, dict): # TODO: this handles the case where HF evaluate returns a dict. diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index c139d849..0b124a67 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -27,7 +27,9 @@ def register_configurable_task(config: Dict[str, str]) -> int: register_task(task_name)(SubClass) if "group" in config: - if type(config["group"]) == str: + if config["group"] == config["task"]: + raise ValueError("task and group name cannot be the same") + elif type(config["group"]) == str: group_name = [config["group"]] else: group_name = config["group"] @@ -45,7 +47,6 @@ def register_configurable_group(config: Dict[str, str], yaml_path: str = None) - task_list = [task for task in all_task_list if type(task) == str] for task_config in config_list: - task_config = utils.load_yaml_config(yaml_path, task_config) var_configs = check_prompt_config( { @@ -137,7 +138,10 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: else: if type(config["task"]) == list: register_configurable_group(config, yaml_path) - + except ModuleNotFoundError as e: + eval_logger.warning( + f"{yaml_path}: {e}. Config will not be added to registry." 
+ ) except Exception as error: import traceback @@ -187,7 +191,6 @@ def get_task_name_from_object(task_object): # TODO: pass num_fewshot and other cmdline overrides in a better way def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs): - config = {**kwargs} task_name_from_registry_dict = {} @@ -199,7 +202,6 @@ def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs): for task_element in task_name_list: if isinstance(task_element, str): - if task_element in GROUP_REGISTRY: group_name = task_element for task_name in GROUP_REGISTRY[task_element]: @@ -237,7 +239,6 @@ def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs): } elif isinstance(task_element, Task): - task_name_from_object_dict = { **task_name_from_object_dict, get_task_name_from_object(task_element): task_element, -- GitLab From e3400936b3f7438de8a6817510d30ab6aa778dd9 Mon Sep 17 00:00:00 2001 From: baberabb <92168766+baberabb@users.noreply.github.com> Date: Mon, 9 Oct 2023 01:48:21 +0500 Subject: [PATCH 088/212] fix indentation --- docs/task_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/task_guide.md b/docs/task_guide.md index d97e86d7..3e15fd9f 100644 --- a/docs/task_guide.md +++ b/docs/task_guide.md @@ -142,7 +142,7 @@ Our final filter pipeline, "maj@8", does majority voting across the first 8 of t - performing the same sequence of filters on these new sets of 8 responses, for each document. ```yaml - name: "maj@8" - filter: + filter: - function: "take_first_k" k: 8 - function: "regex" -- GitLab From ad7921405f2fecb44d59e5c5cfc098901b803682 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 9 Oct 2023 09:02:23 +0000 Subject: [PATCH 089/212] removed --- lm_eval/filters/__init__.py | 1 - lm_eval/filters/extraction.py | 42 ----------------------------------- 2 files changed, 43 deletions(-) diff --git a/lm_eval/filters/__init__.py b/lm_eval/filters/__init__.py index 33dd0573..c74ac015 100644 --- a/lm_eval/filters/__init__.py +++ b/lm_eval/filters/__init__.py @@ -10,7 +10,6 @@ FILTER_REGISTRY = { "majority_vote": selection.MajorityVoteFilter, "take_first_k": selection.TakeKFilter, "remove_whitespace": extraction.WhitespaceFilter, - "cot_filter": extraction.CoTFilter, "lowercase": transformation.LowercaseFilter, "uppercase": transformation.UppercaseFilter, "map": transformation.MapFilter, diff --git a/lm_eval/filters/extraction.py b/lm_eval/filters/extraction.py index bea6675b..345bf99b 100644 --- a/lm_eval/filters/extraction.py +++ b/lm_eval/filters/extraction.py @@ -60,45 +60,3 @@ class WhitespaceFilter(Filter): filtered_resps = [filter_set(resp) for resp in resps] return filtered_resps - - -class CoTFilter(Filter): - """ """ - - def __init__(self): - pass - - def apply(self, resps): - def filter_set(inst): - - filtered_resp = [] - for resp in inst: - - resp = resp.strip() - if resp[-1] in [".", ",", "?", " ", "\n"]: - resp = resp[:-1].strip() - - if resp[0] == "(" and resp[-1] == ")": - resp = resp[1:-1].strip() - return resp - else: - resp = resp.split("resp is")[-1].strip() - resp = resp.split("final resp")[-1].strip() - resp = resp.split("Final resp")[-1].strip() - resp = resp.split("resp:")[-1].strip() - resp = resp.split("resp:")[-1].strip() - if resp and resp[0] in [".", ",", "?", " ", "\n", ":"]: - resp = resp[1:].strip() - if resp and resp[-1] in [".", ",", "?", " ", "\n", ":"]: - resp = resp[:-1].strip() - # corner case 2: is prediction is (B), should processed into B. 
- if resp and resp[0] == "(" and resp[-1] == ")": - resp = resp[1:-1].strip() - - filtered_resp.append(resp) - - return filtered_resp - - filtered_resps = [filter_set(resp) for resp in resps] - - return filtered_resps -- GitLab From bc1b0eca0de8d02e380477523a089d0c1c88f935 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 10 Oct 2023 08:11:52 +0000 Subject: [PATCH 090/212] removed delimiter="" --- lm_eval/__main__.py | 7 +++++++ lm_eval/tasks/__init__.py | 21 +++++++++++-------- .../persona/_template_yaml | 1 - 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index 1996d680..cd5acdc8 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -97,6 +97,12 @@ def parse_eval_args() -> argparse.Namespace: default=None, help="Additional path to include if there are external tasks to include.", ) + parser.add_argument( + "--verbose", + type=bool, + default=False, + help="Log error when tasks are not registered.", + ) return parser.parse_args() @@ -167,6 +173,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: assert args.output_path, "Specify --output_path" eval_logger.info(f"Selected Tasks: {task_names}") + eval_logger.verbose = args.verbose results = evaluator.simple_evaluate( model=args.model, diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index c139d849..ed4f3242 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -139,15 +139,18 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: register_configurable_group(config, yaml_path) except Exception as error: - import traceback - - eval_logger.warning( - "Failed to load config in\n" - f" {yaml_path}\n" - " Config will not be added to registry\n" - f" Error: {error}\n" - f" Traceback: {traceback.format_exc()}" - ) + if eval_logger.verbose: + import traceback + + eval_logger.warning( + "Failed to load config in\n" + f" {yaml_path}\n" + " Config will not be added to registry\n" + f" Error: {error}\n" + f" Traceback: {traceback.format_exc()}" + ) + else: + eval_logger.warning("Yaml failed to register {yaml_path}\n") return 0 diff --git a/lm_eval/tasks/model_written_evals/persona/_template_yaml b/lm_eval/tasks/model_written_evals/persona/_template_yaml index 34721df5..1d7ef279 100644 --- a/lm_eval/tasks/model_written_evals/persona/_template_yaml +++ b/lm_eval/tasks/model_written_evals/persona/_template_yaml @@ -2,7 +2,6 @@ group: persona dataset_path: EleutherAI/persona output_type: multiple_choice validation_split: validation -target_delimiter: "" doc_to_text: "{{question}}" doc_to_target: 0 doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" -- GitLab From 9894597c6d39e23b7d39c3d64f509a88864ceb53 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 10 Oct 2023 08:13:04 +0000 Subject: [PATCH 091/212] add verbose arg --- lm_eval/__main__.py | 7 +++++++ lm_eval/tasks/__init__.py | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index 1996d680..cd5acdc8 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -97,6 +97,12 @@ def parse_eval_args() -> argparse.Namespace: default=None, help="Additional path to include if there are external tasks to include.", ) + parser.add_argument( + "--verbose", + type=bool, + default=False, + help="Log error when tasks are not registered.", + ) return parser.parse_args() @@ -167,6 +173,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] 
= None) -> None: assert args.output_path, "Specify --output_path" eval_logger.info(f"Selected Tasks: {task_names}") + eval_logger.verbose = args.verbose results = evaluator.simple_evaluate( model=args.model, diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index c139d849..ed4f3242 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -139,15 +139,18 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: register_configurable_group(config, yaml_path) except Exception as error: - import traceback - - eval_logger.warning( - "Failed to load config in\n" - f" {yaml_path}\n" - " Config will not be added to registry\n" - f" Error: {error}\n" - f" Traceback: {traceback.format_exc()}" - ) + if eval_logger.verbose: + import traceback + + eval_logger.warning( + "Failed to load config in\n" + f" {yaml_path}\n" + " Config will not be added to registry\n" + f" Error: {error}\n" + f" Traceback: {traceback.format_exc()}" + ) + else: + eval_logger.warning("Yaml failed to register {yaml_path}\n") return 0 -- GitLab From e33a7d92d68d33ed9fd6de4bcfae51742e7b68b4 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 10 Oct 2023 09:16:25 +0000 Subject: [PATCH 092/212] error set to DEBUG --- lm_eval/__main__.py | 8 ++++---- lm_eval/tasks/__init__.py | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index cd5acdc8..b4839f8c 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -98,9 +98,9 @@ def parse_eval_args() -> argparse.Namespace: help="Additional path to include if there are external tasks to include.", ) parser.add_argument( - "--verbose", - type=bool, - default=False, + "--verbosity", + type=str, + default="INFO", help="Log error when tasks are not registered.", ) return parser.parse_args() @@ -112,6 +112,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: # we allow for args to be passed externally, else we parse them ourselves args = parse_eval_args() + eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) os.environ["TOKENIZERS_PARALLELISM"] = "false" if args.limit: @@ -173,7 +174,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: assert args.output_path, "Specify --output_path" eval_logger.info(f"Selected Tasks: {task_names}") - eval_logger.verbose = args.verbose results = evaluator.simple_evaluate( model=args.model, diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index ed4f3242..6f0b2b38 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -4,7 +4,7 @@ from typing import List, Union, Dict from lm_eval import utils from lm_eval import prompts -from lm_eval.logger import eval_logger +# from lm_eval.logger import eval_logger from lm_eval.api.task import TaskConfig, Task, ConfigurableTask from lm_eval.api.registry import ( register_task, @@ -14,6 +14,9 @@ from lm_eval.api.registry import ( ALL_TASKS, ) +import logging + +eval_logger = logging.getLogger('lm-eval') def register_configurable_task(config: Dict[str, str]) -> int: SubClass = type( @@ -139,18 +142,15 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: register_configurable_group(config, yaml_path) except Exception as error: - if eval_logger.verbose: - import traceback - - eval_logger.warning( - "Failed to load config in\n" - f" {yaml_path}\n" - " Config will not be added to registry\n" - f" Error: {error}\n" - f" Traceback: {traceback.format_exc()}" - ) - else: - 
eval_logger.warning("Yaml failed to register {yaml_path}\n") + import traceback + + eval_logger.debug( + "Failed to load config in\n" + f" {yaml_path}\n" + " Config will not be added to registry\n" + f" Error: {error}\n" + f" Traceback: {traceback.format_exc()}" + ) return 0 -- GitLab From 0d701496f38dc16c3fa918a8241f008566577343 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 10 Oct 2023 15:00:16 +0000 Subject: [PATCH 093/212] fixes and adjustments --- lm_eval/api/task.py | 2 +- .../tasks/model_written_evals/advanced_ai_risk/_template_yaml | 3 ++- .../human-self-awareness-training-web-gpt.yaml | 4 ---- .../advanced_ai_risk/human-self-awareness-web-gpt.yaml | 4 ++++ lm_eval/tasks/model_written_evals/persona/_template_yaml | 1 + .../sycophancy/sycophancy_on_nlp_survey.yaml | 1 + .../sycophancy/sycophancy_on_philpapers2020.yaml | 1 + .../sycophancy/sycophancy_on_political_typology_quiz.yaml | 1 + 8 files changed, 11 insertions(+), 6 deletions(-) delete mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml create mode 100644 lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 5bc07b6a..6ce0b3f1 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -687,7 +687,7 @@ class ConfigurableTask(Task): for choice in check_choices: choice_has_whitespace = True if choice[0].isspace() else False delimiter_has_whitespace = ( - True if self.config.target_delimiter[-1].isspace() else False + True if (len(self.config.target_delimiter) >= 1 and self.config.target_delimiter[-1].isspace()) else False ) if delimiter_has_whitespace and choice_has_whitespace: diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml index 92f7ac0c..9a2b36d6 100644 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml @@ -1,7 +1,8 @@ group: advanced_ai_risk dataset_path: EleutherAI/advanced_ai_risk output_type: multiple_choice -validation_split: train +validation_split: validation +target_delimiter: "" doc_to_text: "Human: {question}\n\nAssistant:" doc_to_target: 0 doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml deleted file mode 100644 index b1ab92cf..00000000 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-web-gpt.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by _generate_configs.py -dataset_name: human-self-awareness-training-web-gpt -include: _template_yaml -task: advanced_ai_risk_human-self-awareness-training-web-gpt diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml new file mode 100644 index 00000000..e34a4b9f --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-web-gpt +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-web-gpt diff --git a/lm_eval/tasks/model_written_evals/persona/_template_yaml 
b/lm_eval/tasks/model_written_evals/persona/_template_yaml index 1d7ef279..34721df5 100644 --- a/lm_eval/tasks/model_written_evals/persona/_template_yaml +++ b/lm_eval/tasks/model_written_evals/persona/_template_yaml @@ -2,6 +2,7 @@ group: persona dataset_path: EleutherAI/persona output_type: multiple_choice validation_split: validation +target_delimiter: "" doc_to_text: "{{question}}" doc_to_target: 0 doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml index c4549a91..9c31c518 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml @@ -1,4 +1,5 @@ group: sycophancy +task: sycophancy_on_nlp_survey dataset_path: EleutherAI/sycophancy dataset_name: sycophancy_on_nlp_survey output_type: multiple_choice diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml index a26abeb9..53589c87 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml @@ -1,4 +1,5 @@ group: sycophancy +task: sycophancy_on_philpapers2020 dataset_path: EleutherAI/sycophancy dataset_name: sycophancy_on_philpapers2020 output_type: multiple_choice diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml index d3f2610c..f16d8807 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml @@ -1,4 +1,5 @@ group: sycophancy +task: sycophancy_on_political_typology_quiz dataset_path: EleutherAI/sycophancy dataset_name: sycophancy_on_political_typology_quiz output_type: multiple_choice -- GitLab From 1aa3bc1e3e9cf4ba7182512f3a40ab6b80decbdf Mon Sep 17 00:00:00 2001 From: Zhiwei Zhuang Date: Wed, 11 Oct 2023 11:36:31 +0800 Subject: [PATCH 094/212] add _batch_scheduler in greedy_until --- lm_eval/models/huggingface.py | 58 ++++++++++++++++++++++------------- lm_eval/utils.py | 2 +- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 57dae974..cf3cfbbb 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -620,6 +620,25 @@ class HFLM(LM): loglikelihoods.append(string_nll) return loglikelihoods + + def _batch_scheduler(self, pos, n_reordered_requests): + sched = pos // int(len(n_reordered_requests) / self.batch_schedule) + if sched in self.batch_sizes: + return self.batch_sizes[sched] + if (len(self.batch_sizes) > 1) and ( + self.batch_sizes[sched - 1] == self.max_batch_size + ): + # if previous batch size is already maximal, skip recomputation + self.batch_sizes[sched] = self.max_batch_size + return self.batch_sizes[sched] + print( + f"Passed argument batch_size = auto:{self.batch_schedule}. 
Detecting largest batch size" + ) + self.batch_sizes[sched] = self._detect_batch_size( + n_reordered_requests, pos + ) + print(f"Determined largest batch size: {self.batch_sizes[sched]}") + return self.batch_sizes[sched] def _loglikelihood_tokens( self, requests, disable_tqdm: bool = False, override_bs=None @@ -644,25 +663,6 @@ class HFLM(LM): # automatic (variable) batch size detection for vectorization # pull longest context sample from request - def _batch_scheduler(pos): - sched = pos // int(n_reordered_requests / self.batch_schedule) - if sched in self.batch_sizes: - return self.batch_sizes[sched] - if (len(self.batch_sizes) > 1) and ( - self.batch_sizes[sched - 1] == self.max_batch_size - ): - # if previous batch size is already maximal, skip recomputation - self.batch_sizes[sched] = self.max_batch_size - return self.batch_sizes[sched] - print( - f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size" - ) - self.batch_sizes[sched] = self._detect_batch_size( - re_ord.get_reordered(), pos - ) - print(f"Determined largest batch size: {self.batch_sizes[sched]}") - return self.batch_sizes[sched] - for chunk in utils.chunks( tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))), n=self.batch_size @@ -670,7 +670,7 @@ class HFLM(LM): else override_bs if override_bs is not None else 0, - fn=_batch_scheduler + fn=self._batch_scheduler if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs @@ -838,13 +838,27 @@ class HFLM(LM): re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) pbar = tqdm(total=len(requests), disable=(self.rank != 0)) - + if self.batch_size == "auto": + # using rolling window with maximum context + print("Passed argument batch_size = auto. Detecting largest batch size") + batch_size = self._detect_batch_size() + print(f"Determined Largest batch size: {batch_size}") + adaptive_batch_size = batch_size # for each different set of kwargs, we execute all requests, by batch. for key, re_ord in re_ords.items(): for chunk in utils.chunks( re_ord.get_reordered(), - self.batch_size, + n=self.batch_size + if self.batch_size != "auto" + else adaptive_batch_size + if adaptive_batch_size is not None + else 0, + fn=self._batch_scheduler + if self.batch_size == "auto" + and not adaptive_batch_size + else None, ): + contexts, all_gen_kwargs = zip(*chunk) # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 356fdf7b..d246470a 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -78,7 +78,7 @@ def chunks(iter, n: int = 0, fn=None): arr = [] for i, x in enumerate(iter): arr.append(x) - if len(arr) == (fn(i) if fn else n): + if len(arr) == (fn(i, iter) if fn else n): yield arr arr = [] -- GitLab From 2bd5dcb69d0eef6495005d63b7f126222bc2ffb8 Mon Sep 17 00:00:00 2001 From: Zhiwei Zhuang Date: Wed, 11 Oct 2023 11:54:55 +0800 Subject: [PATCH 095/212] finished test code --- lm_eval/models/huggingface.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index cf3cfbbb..fd288c72 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -847,7 +847,7 @@ class HFLM(LM): # for each different set of kwargs, we execute all requests, by batch. 
         for key, re_ord in re_ords.items():
             for chunk in utils.chunks(
                 re_ord.get_reordered(),
-                self.batch_size,
+                n=self.batch_size
+                if self.batch_size != "auto"
+                else adaptive_batch_size
+                if adaptive_batch_size is not None
+                else 0,
+                fn=self._batch_scheduler
+                if self.batch_size == "auto"
+                and not adaptive_batch_size
+                else None,
             ):
+
                 contexts, all_gen_kwargs = zip(*chunk)
                 # we assume all gen kwargs in the batch are the same
                 # this is safe to assume because the `grouper` object ensures it.
diff --git a/lm_eval/utils.py b/lm_eval/utils.py
index 356fdf7b..d246470a 100644
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -78,7 +78,7 @@ def chunks(iter, n: int = 0, fn=None):
     arr = []
     for i, x in enumerate(iter):
         arr.append(x)
-        if len(arr) == (fn(i) if fn else n):
+        if len(arr) == (fn(i, iter) if fn else n):
             yield arr
             arr = []
--
GitLab


From 2bd5dcb69d0eef6495005d63b7f126222bc2ffb8 Mon Sep 17 00:00:00 2001
From: Zhiwei Zhuang
Date: Wed, 11 Oct 2023 11:54:55 +0800
Subject: [PATCH 095/212] finished test code

---
 lm_eval/models/huggingface.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index cf3cfbbb..fd288c72 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -847,7 +847,7 @@ class HFLM(LM):
         # for each different set of kwargs, we execute all requests, by batch.
         for key, re_ord in re_ords.items():
             for chunk in utils.chunks(
-                re_ord.get_reordered(),
+                tqdm(re_ord.get_reordered(), disable=self.rank != 0),
                 n=self.batch_size
                 if self.batch_size != "auto"
                 else adaptive_batch_size
                 if adaptive_batch_size is not None
                 else 0,
                 fn=self._batch_scheduler
                 if self.batch_size == "auto" and not adaptive_batch_size
                 else None,
             ):
-
                 contexts, all_gen_kwargs = zip(*chunk)
                 # we assume all gen kwargs in the batch are the same
                 # this is safe to assume because the `grouper` object ensures it.
--
GitLab


From 660dfb71d15ad0e4fe5ddd3f4a36373d49871da8 Mon Sep 17 00:00:00 2001
From: Zhiwei Zhuang
Date: Wed, 11 Oct 2023 18:20:42 +0800
Subject: [PATCH 096/212] check with pre-commit

---
 .github/workflows/unit_tests.yml | 2 +-
 lm_eval/models/huggingface.py    | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 4f105d09..38c9a82b 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -43,7 +43,7 @@ jobs:
 #   # mypy turned off for now
 #   - name: Lint with mypy
 #     run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
-Job 2
+Job 2:
   testcpu:
     name: CPU Tests
     runs-on: ubuntu-latest
diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index fd288c72..d4b4c9b6 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -620,7 +620,7 @@ class HFLM(LM):
         loglikelihoods.append(string_nll)

         return loglikelihoods
-
+
     def _batch_scheduler(self, pos, n_reordered_requests):
         sched = pos // int(len(n_reordered_requests) / self.batch_schedule)
         if sched in self.batch_sizes:
@@ -634,9 +634,7 @@ class HFLM(LM):
         print(
             f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
         )
-        self.batch_sizes[sched] = self._detect_batch_size(
-            n_reordered_requests, pos
-        )
+        self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
         print(f"Determined largest batch size: {self.batch_sizes[sched]}")
         return self.batch_sizes[sched]

@@ -854,8 +852,7 @@ class HFLM(LM):
                 fn=self._batch_scheduler
-                if self.batch_size == "auto"
-                and not adaptive_batch_size
+                if self.batch_size == "auto" and not adaptive_batch_size
                 else None,
             ):
                 contexts, all_gen_kwargs = zip(*chunk)
--
GitLab


From 053e1f5f0ca4ebaab9a06bef5e67c67ebd4d189c Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Thu, 12 Oct 2023 15:53:05 +0700
Subject: [PATCH 097/212] Update pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index df668fb3..c545a52a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,7 @@ packages = ["lm_eval"]

 # required to include yaml files in pip installation
 [tool.setuptools.package-data]
-lm_eval = ["**/*.yaml", "tasks/**/*"]
+lm_eval = ["**/*.yaml", "lm_eval/**/*"]
 examples = ["**/*.yaml"]

 [project.scripts]
--
GitLab


From 59204667be71c8b878a6a76c0f808924a8a19598 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Thu, 12 Oct 2023 16:04:33 +0700
Subject: [PATCH 098/212] Update pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c545a52a..0689c34c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,7 @@ packages = ["lm_eval"]

 # required to include yaml files in pip installation
 [tool.setuptools.package-data]
-lm_eval = ["**/*.yaml", "lm_eval/**/*"]
+lm_eval = ["**/*.yaml", "api/**/*", "decontamination/**/*", "filters/**/*", "models/**/*", "prompts/**/*", "tasks/**/*"]
 examples = ["**/*.yaml"]

 [project.scripts]
--
GitLab


From 32a6362cff11118492ff6899180fcec581dd122d Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Thu, 12 Oct 2023 19:22:56 +0700
Subject: [PATCH 099/212] Update __init__.py

---
 lm_eval/tasks/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 6f0b2b38..d2add084 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -4,7 +4,6 @@ from typing import List, Union, Dict
 from lm_eval import utils
 from lm_eval import prompts

-# from lm_eval.logger import eval_logger
 from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
 from lm_eval.api.registry import (
     register_task,
--
GitLab


From 10625bd808c7dbd26e048808ac8fce6156089af2 Mon Sep 17 00:00:00 2001
From: Jason Krone
Date: Fri, 13 Oct 2023 10:46:10 -0700
Subject: [PATCH 100/212] Fix "TypeError: 'tqdm' object is not subscriptable"
 error that occurs in hugging face model loglikelihood_tokens and greedy_util
 functions when batch-size is set to auto

---
 lm_eval/models/huggingface.py | 37 ++++++++++++-----------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index d4b4c9b6..39eab765 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -661,19 +661,13 @@ class HFLM(LM):
         # automatic (variable) batch size detection for vectorization
         # pull longest context sample from request

-        for chunk in utils.chunks(
-            tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))),
-            n=self.batch_size
-            if self.batch_size != "auto"
-            else override_bs
-            if override_bs is not None
-            else 0,
-            fn=self._batch_scheduler
-            if self.batch_size == "auto"
-            and n_reordered_requests > 0
-            and not override_bs
-            else None,
-        ):
+        chunks = utils.chunks(
+            re_ord.get_reordered(),
+            n=self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0,
+            fn=self._batch_scheduler if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs else None,
+        )
+
+        for chunk in tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))):
             inps = []
             cont_toks_list = []
             inplens = []
@@ -844,17 +838,12 @@ class HFLM(LM):
         adaptive_batch_size = batch_size
         # for each different set of kwargs, we execute all requests, by batch.
         for key, re_ord in re_ords.items():
-            for chunk in utils.chunks(
-                tqdm(re_ord.get_reordered(), disable=self.rank != 0),
-                n=self.batch_size
-                if self.batch_size != "auto"
-                else adaptive_batch_size
-                if adaptive_batch_size is not None
-                else 0,
-                fn=self._batch_scheduler
-                if self.batch_size == "auto" and not adaptive_batch_size
-                else None,
-            ):
+            chunks = utils.chunks(
+                re_ord.get_reordered(),
+                n=self.batch_size if self.batch_size != "auto" else adaptive_batch_size if adaptive_batch_size is not None else 0,
+                fn=self._batch_scheduler if self.batch_size == "auto" and not adaptive_batch_size else None,
+            )
+            for chunk in tqdm(chunks, disable=self.rank != 0):
                 contexts, all_gen_kwargs = zip(*chunk)
                 # we assume all gen kwargs in the batch are the same
                 # this is safe to assume because the `grouper` object ensures it.
--
GitLab


From ec581e8ef2f089686199780a0041c76eb8f0f078 Mon Sep 17 00:00:00 2001
From: Stella Biderman
Date: Fri, 13 Oct 2023 20:10:27 -0400
Subject: [PATCH 101/212] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 62dc36e0..9c6a054f 100644
--- a/README.md
+++ b/README.md
@@ -232,7 +232,7 @@ We support wildcards in task names, for example you can run all of the machine-t

 To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).

-As a start, we currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, we support prompts authored in the [Promptsource Library](https://github.com/bigscience-workshop/promptsource/tree/main) as described further in https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md and https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/advanced_task_guide.md and welcome contributions of novel task templates and task variants.
+As a start, we currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, we support prompts authored in the [Promptsource Library](https://github.com/bigscience-workshop/promptsource/tree/main) as described further in [the task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md) and [the advanced task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/advanced_task_guide.md) and welcome contributions of novel task templates and task variants.

 ## How to Contribute or Learn More?
--
GitLab


From 92f2546392e211423508bf90627a96a31567e911 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 16 Oct 2023 04:42:20 +0000
Subject: [PATCH 102/212] modfied to add subcategory

---
 lm_eval/tasks/mmlu/_generate_configs.py       | 146 ++++++++++--------
 .../tasks/mmlu/default/_default_template_yaml |   1 -
 lm_eval/tasks/mmlu/default/mmlu.yaml          |   6 +
 .../mmlu/default/mmlu_abstract_algebra.yaml   |   1 +
 lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml  |   1 +
 .../tasks/mmlu/default/mmlu_astronomy.yaml    |   1 +
 .../mmlu/default/mmlu_business_ethics.yaml    |   1 +
 .../mmlu/default/mmlu_clinical_knowledge.yaml |   1 +
 .../mmlu/default/mmlu_college_biology.yaml    |   1 +
 .../mmlu/default/mmlu_college_chemistry.yaml  |   1 +
 .../mmlu_college_computer_science.yaml        |   1 +
 .../default/mmlu_college_mathematics.yaml     |   1 +
 .../mmlu/default/mmlu_college_medicine.yaml   |   1 +
 .../mmlu/default/mmlu_college_physics.yaml    |   1 +
 .../mmlu/default/mmlu_computer_security.yaml  |   1 +
 .../mmlu/default/mmlu_conceptual_physics.yaml |   1 +
 .../tasks/mmlu/default/mmlu_econometrics.yaml |   1 +
 .../default/mmlu_electrical_engineering.yaml  |   1 +
 .../default/mmlu_elementary_mathematics.yaml  |   1 +
 .../tasks/mmlu/default/mmlu_formal_logic.yaml |   1 +
 .../tasks/mmlu/default/mmlu_global_facts.yaml |   1 +
 .../default/mmlu_high_school_biology.yaml     |   1 +
 .../default/mmlu_high_school_chemistry.yaml   |   1 +
 .../mmlu_high_school_computer_science.yaml    |   1 +
 .../mmlu_high_school_european_history.yaml    |   1 +
 .../default/mmlu_high_school_geography.yaml   |   1 +
 ...u_high_school_government_and_politics.yaml |   1 +
 .../mmlu_high_school_macroeconomics.yaml      |   1 +
 .../default/mmlu_high_school_mathematics.yaml |   1 +
 .../mmlu_high_school_microeconomics.yaml      |   1 +
 .../default/mmlu_high_school_physics.yaml     |   1 +
 .../default/mmlu_high_school_psychology.yaml  |   1 +
 .../default/mmlu_high_school_statistics.yaml  |   1 +
 .../default/mmlu_high_school_us_history.yaml  |   1 +
 .../mmlu_high_school_world_history.yaml       |   1 +
 .../tasks/mmlu/default/mmlu_human_aging.yaml  |   1 +
 .../mmlu/default/mmlu_human_sexuality.yaml    |   1 +
 .../mmlu/default/mmlu_international_law.yaml  |   1 +
 .../mmlu/default/mmlu_jurisprudence.yaml      |   1 +
 .../mmlu/default/mmlu_logical_fallacies.yaml  |   1 +
 .../mmlu/default/mmlu_machine_learning.yaml   |   1 +
 .../tasks/mmlu/default/mmlu_management.yaml   |   1 +
 .../tasks/mmlu/default/mmlu_marketing.yaml    |   1 +
 .../mmlu/default/mmlu_medical_genetics.yaml   |   1 +
 .../mmlu/default/mmlu_miscellaneous.yaml      |   1 +
 .../mmlu/default/mmlu_moral_disputes.yaml     |   1 +
 .../mmlu/default/mmlu_moral_scenarios.yaml    |   1 +
 .../tasks/mmlu/default/mmlu_nutrition.yaml    |   1 +
 .../tasks/mmlu/default/mmlu_philosophy.yaml   |   1 +
 .../tasks/mmlu/default/mmlu_prehistory.yaml   |   1 +
 .../default/mmlu_professional_accounting.yaml |   1 +
 .../mmlu/default/mmlu_professional_law.yaml   |   1 +
 .../default/mmlu_professional_medicine.yaml   |   1 +
 .../default/mmlu_professional_psychology.yaml |   1 +
 .../mmlu/default/mmlu_public_relations.yaml   |   1 +
 .../mmlu/default/mmlu_security_studies.yaml   |   1 +
 .../tasks/mmlu/default/mmlu_sociology.yaml    |   1 +
 .../mmlu/default/mmlu_us_foreign_policy.yaml  |   1 +
 lm_eval/tasks/mmlu/default/mmlu_virology.yaml |   1 +
 .../mmlu/default/mmlu_world_religions.yaml    |   1 +
 60 files changed, 147 insertions(+), 63 deletions(-)
 create mode 100644 lm_eval/tasks/mmlu/default/mmlu.yaml

diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py
index 542e11b2..b34a39ea 100644
--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
@@ -1,5 +1,5 @@
 """
-Take in a YAML, and output all other splits with this YAML
+Take in a YAML, and output all "other" splits with this YAML
 """
 import os
 import yaml
@@ -10,65 +10,65 @@ from tqdm import tqdm
 from lm_eval import utils
 from lm_eval.logger import eval_logger

-SUBJECTS = [
-    "abstract_algebra",
-    "anatomy",
-    "astronomy",
-    "business_ethics",
-    "clinical_knowledge",
-    "college_biology",
-    "college_chemistry",
-    "college_computer_science",
-    "college_mathematics",
-    "college_medicine",
-    "college_physics",
-    "computer_security",
-    "conceptual_physics",
-    "econometrics",
-    "electrical_engineering",
-    "elementary_mathematics",
-    "formal_logic",
-    "global_facts",
-    "high_school_biology",
-    "high_school_chemistry",
-    "high_school_computer_science",
-    "high_school_european_history",
-    "high_school_geography",
-    "high_school_government_and_politics",
-    "high_school_macroeconomics",
-    "high_school_mathematics",
-    "high_school_microeconomics",
-    "high_school_physics",
-    "high_school_psychology",
-    "high_school_statistics",
-    "high_school_us_history",
-    "high_school_world_history",
-    "human_aging",
-    "human_sexuality",
-    "international_law",
-    "jurisprudence",
-    "logical_fallacies",
-    "machine_learning",
-    "management",
-    "marketing",
-    "medical_genetics",
-    "miscellaneous",
-    "moral_disputes",
-    "moral_scenarios",
-    "nutrition",
-    "philosophy",
-    "prehistory",
-    "professional_accounting",
-    "professional_law",
-    "professional_medicine",
-    "professional_psychology",
-    "public_relations",
-    "security_studies",
-    "sociology",
-    "us_foreign_policy",
-    "virology",
-    "world_religions",
-]
+SUBJECTS = {
+    "abstract_algebra": "stem",
+    "anatomy": "stem",
+    "astronomy": "stem",
+    "business_ethics": "other",
+    "clinical_knowledge": "other",
+    "college_biology": "stem",
+    "college_chemistry": "stem",
+    "college_computer_science": "stem",
+    "college_mathematics": "stem",
+    "college_medicine": "other",
+    "college_physics": "stem",
+    "computer_security": "stem",
+    "conceptual_physics": "stem",
+    "econometrics": "social_sciences",
+    "electrical_engineering": "stem",
+    "elementary_mathematics": "stem",
+    "formal_logic": "humanities",
+    "global_facts": "other",
+    "high_school_biology": "stem",
+    "high_school_chemistry": "stem",
+    "high_school_computer_science": "stem",
+    "high_school_european_history": "humanities",
+    "high_school_geography": "social_sciences",
+    "high_school_government_and_politics": "social_sciences",
+    "high_school_macroeconomics": "social_sciences",
+    "high_school_mathematics": "stem",
+    "high_school_microeconomics": "social_sciences",
+    "high_school_physics": "stem",
+    "high_school_psychology": "social_sciences",
+    "high_school_statistics": "stem",
+    "high_school_us_history": "humanities",
+    "high_school_world_history": "humanities",
+    "human_aging": "other",
+    "human_sexuality": "social_sciences",
+    "international_law": "humanities",
+    "jurisprudence": "humanities",
+    "logical_fallacies": "humanities",
+    "machine_learning": "stem",
+    "management": "other",
+    "marketing": "other",
+    "medical_genetics": "other",
+    "miscellaneous": "other",
+    "moral_disputes": "humanities",
+    "moral_scenarios": "humanities",
+    "nutrition": "other",
+    "philosophy": "humanities",
+    "prehistory": "humanities",
+    "professional_accounting": "other",
+    "professional_law": "humanities",
+    "professional_medicine": "other",
+    "professional_psychology": "social_sciences",
+    "public_relations": "social_sciences",
+    "security_studies": "social_sciences",
+    "sociology": "social_sciences",
+    "us_foreign_policy": "social_sciences",
+    "virology": "other",
+    "world_religions": "humanities",
+}


 def parse_args():
@@ -77,6 +77,7 @@ def parse_args():
     parser.add_argument("--save_prefix_path", default="flan")
     parser.add_argument("--cot_prompt_path", default=None)
     parser.add_argument("--task_prefix", default="")
+    parser.add_argument("--group_prefix", default="")
     return parser.parse_args()


@@ -84,7 +85,7 @@ if __name__ == "__main__":

     args = parse_args()

-    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
+    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
     with open(args.base_yaml_path) as f:
         base_yaml = yaml.full_load(f)
@@ -95,7 +96,12 @@ if __name__ == "__main__":
         with open(args.cot_prompt_path) as f:
             cot_file = json.load(f)

-    for subject in tqdm(SUBJECTS):
+    ALL_CATEGORIES = []
+    for subject, category in tqdm(SUBJECTS.items()):
+
+        if category not in ALL_CATEGORIES:
+            ALL_CATEGORIES.append(category)
+
         if args.cot_prompt_path is not None:
             description = cot_file[subject]
         else:
@@ -103,6 +109,7 @@ if __name__ == "__main__":

         yaml_dict = {
             "include": base_yaml_name,
+            "group": f"mmlu_{category}",
             "task": f"mmlu_{args.task_prefix}_{subject}"
             if args.task_prefix != ""
             else f"mmlu_{subject}",
@@ -120,3 +127,18 @@ if __name__ == "__main__":
                 allow_unicode=True,
                 default_style='"',
             )
+
+    if args.group_prefix == "":
+        file_save_path = args.save_prefix_path + ".yaml"
+    else:
+        file_save_path = args.save_prefix_path + f"_{args.group_prefix}.yaml"
+    eval_logger.info(f"Saving benchmark config to {file_save_path}")
+    with open(file_save_path, "w") as yaml_file:
+        yaml.dump(
+            {
+                "group": f"mmlu_{args.group_prefix}",
+                "task": [f"mmlu_{category}" for category in ALL_CATEGORIES]
+            },
+            yaml_file,
+            default_flow_style=False
+        )
diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml
index bd989c40..7669f8ce 100644
--- a/lm_eval/tasks/mmlu/default/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/default/_default_template_yaml
@@ -1,4 +1,3 @@
-group: mmlu
 dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
diff --git a/lm_eval/tasks/mmlu/default/mmlu.yaml b/lm_eval/tasks/mmlu/default/mmlu.yaml
new file mode 100644
index 00000000..584de029
--- /dev/null
+++ b/lm_eval/tasks/mmlu/default/mmlu.yaml
@@ -0,0 +1,6 @@
+group: mmlu
+task:
+  - mmlu_stem
+  - mmlu_other
+  - mmlu_social_sciences
+  - mmlu_humanities
diff --git a/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml
index b6d595d3..bb786cc8 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "abstract_algebra"
 "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_abstract_algebra"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml
index 6459cb41..22eaa7fd 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "anatomy"
 "description": "The following are multiple choice questions (with answers) about anatomy.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_anatomy"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml
index 573dedd7..64f20d20 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "astronomy"
 "description": "The following are multiple choice questions (with answers) about astronomy.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_astronomy"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml
index 4b20b795..49330917 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "business_ethics"
 "description": "The following are multiple choice questions (with answers) about business ethics.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_business_ethics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml
index f758e66d..547aeccf 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "clinical_knowledge"
 "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_clinical_knowledge"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml
index f8069007..69826397 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "college_biology"
 "description": "The following are multiple choice questions (with answers) about college biology.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_college_biology"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml
index e03fbccd..b91c07f6 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "college_chemistry"
 "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_college_chemistry"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml
index a9d4a6f2..a89a46aa 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "college_computer_science"
 "description": "The following are multiple choice questions (with answers) about college computer science.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_college_computer_science"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml
index f6a86179..c452ff97 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "college_mathematics"
 "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_college_mathematics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml
index 0ea75fb3..d696a40d 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "college_medicine"
 "description": "The following are multiple choice questions (with answers) about college medicine.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_college_medicine"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml
index 82f13e40..16046e53 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "college_physics"
 "description": "The following are multiple choice questions (with answers) about college physics.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_college_physics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml
index e9e06de2..923967ae 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "computer_security"
 "description": "The following are multiple choice questions (with answers) about computer security.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_computer_security"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml
index 30ca6efe..88096f3b 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "conceptual_physics"
 "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_conceptual_physics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml
index 680cc507..4c43a5c8 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "econometrics"
 "description": "The following are multiple choice questions (with answers) about econometrics.\n\n"
+"group": "mmlu_social_sciences"
 "include": "_default_template_yaml"
 "task": "mmlu_econometrics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml
index 8dd63b33..27ab4828 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "electrical_engineering"
 "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_electrical_engineering"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml
index 4979ee30..bd7106e4 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "elementary_mathematics"
 "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_elementary_mathematics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml
index 9b73509b..98486ebe 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "formal_logic"
 "description": "The following are multiple choice questions (with answers) about formal logic.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_formal_logic"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml
index 8c43a6c9..9db3f3d3 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "global_facts"
 "description": "The following are multiple choice questions (with answers) about global facts.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_global_facts"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml
index 453d3033..0ed8c0e7 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_biology"
 "description": "The following are multiple choice questions (with answers) about high school biology.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_biology"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml
index 714ee0e5..7aa6037d 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_chemistry"
 "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_chemistry"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml
index 9326e259..9cf212af 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_computer_science"
 "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_computer_science"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml
index e212cd22..e9189bd9 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_european_history"
 "description": "The following are multiple choice questions (with answers) about high school european history.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_european_history"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml
index a7fffc25..7573c8c2 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_geography"
 "description": "The following are multiple choice questions (with answers) about high school geography.\n\n"
+"group": "mmlu_social_sciences"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_geography"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml
index 7255d60f..83d7d498 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_government_and_politics"
 "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n"
+"group": "mmlu_social_sciences"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_government_and_politics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml
index 29d9ddd7..4e8269b3 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_macroeconomics"
 "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n"
+"group": "mmlu_social_sciences"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_macroeconomics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml
index 035e7a12..2b9d3216 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_mathematics"
 "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_mathematics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml
index 72b1c8cf..3206bfdf 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_microeconomics"
 "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n"
+"group": "mmlu_social_sciences"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_microeconomics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml
index ef8f6ca5..27a1e51a 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_physics"
 "description": "The following are multiple choice questions (with answers) about high school physics.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_physics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml
index 5c4cce75..1e0b1628 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_psychology"
 "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n"
+"group": "mmlu_social_sciences"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_psychology"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml
index 20ed42ec..4244ea8f 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_statistics"
 "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_statistics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml
index 18cd48da..a6f085ec 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_us_history"
 "description": "The following are multiple choice questions (with answers) about high school us history.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_us_history"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml
index b17daac6..c23d93c8 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "high_school_world_history"
 "description": "The following are multiple choice questions (with answers) about high school world history.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_high_school_world_history"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml
index 080b2676..1478d3dd 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "human_aging"
 "description": "The following are multiple choice questions (with answers) about human aging.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_human_aging"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml
index ca3389fe..ab56a035 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "human_sexuality"
 "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n"
+"group": "mmlu_social_sciences"
 "include": "_default_template_yaml"
 "task": "mmlu_human_sexuality"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml
index a3d443e0..7a352701 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "international_law"
 "description": "The following are multiple choice questions (with answers) about international law.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_international_law"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml
index 4ba00a2a..af29fae3 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "jurisprudence"
 "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_jurisprudence"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml
index ea45a4f3..570fd3dd 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "logical_fallacies"
 "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_logical_fallacies"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml
index 2ba6d162..11166e2f 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "machine_learning"
 "description": "The following are multiple choice questions (with answers) about machine learning.\n\n"
+"group": "mmlu_stem"
 "include": "_default_template_yaml"
 "task": "mmlu_machine_learning"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_management.yaml b/lm_eval/tasks/mmlu/default/mmlu_management.yaml
index b4ea6da9..745ac762 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_management.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_management.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "management"
 "description": "The following are multiple choice questions (with answers) about management.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_management"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml
index afa30a0c..38401dc8 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "marketing"
 "description": "The following are multiple choice questions (with answers) about marketing.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_marketing"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml
index 92095635..2e4fbbd8 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "medical_genetics"
 "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_medical_genetics"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml
index 94ebd1b0..aa674180 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "miscellaneous"
 "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_miscellaneous"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml
index 8bea0a1f..ac8bbdb9 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "moral_disputes"
 "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_moral_disputes"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml
index 71dcc693..33a249c2 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "moral_scenarios"
 "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_moral_scenarios"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml
index e6b4cbcd..44b799cd 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "nutrition"
 "description": "The following are multiple choice questions (with answers) about nutrition.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_nutrition"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml
index b9a0b2c5..5a703cc4 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "philosophy"
 "description": "The following are multiple choice questions (with answers) about philosophy.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_philosophy"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml
index 7f71bd54..dc8e65b8 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "prehistory"
 "description": "The following are multiple choice questions (with answers) about prehistory.\n\n"
+"group": "mmlu_humanities"
 "include": "_default_template_yaml"
 "task": "mmlu_prehistory"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml
index 94ca6e6e..c59ccffd 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml
+++ b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml
@@ -1,4 +1,5 @@
 "dataset_name": "professional_accounting"
 "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n"
+"group": "mmlu_other"
 "include": "_default_template_yaml"
 "task": "mmlu_professional_accounting"
diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml
index 074c34e6..46a3ebbd 100644
--- a/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml
+++
b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml @@ -1,4 +1,5 @@ "dataset_name": "professional_law" "description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"group": "mmlu_humanities" "include": "_default_template_yaml" "task": "mmlu_professional_law" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml index 2f99c316..fe52278d 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml @@ -1,4 +1,5 @@ "dataset_name": "professional_medicine" "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"group": "mmlu_other" "include": "_default_template_yaml" "task": "mmlu_professional_medicine" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml index 01565848..ff7bb1f7 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml @@ -1,4 +1,5 @@ "dataset_name": "professional_psychology" "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" +"group": "mmlu_social_sciences" "include": "_default_template_yaml" "task": "mmlu_professional_psychology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml index 0d46c66e..290a85f5 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml @@ -1,4 +1,5 @@ "dataset_name": "public_relations" "description": "The following are multiple choice questions (with answers) about public relations.\n\n" +"group": "mmlu_social_sciences" "include": "_default_template_yaml" "task": "mmlu_public_relations" diff --git a/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml index f30dffde..d1a41871 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml @@ -1,4 +1,5 @@ "dataset_name": "security_studies" "description": "The following are multiple choice questions (with answers) about security studies.\n\n" +"group": "mmlu_social_sciences" "include": "_default_template_yaml" "task": "mmlu_security_studies" diff --git a/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml index c36bd403..be1e46f5 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml @@ -1,4 +1,5 @@ "dataset_name": "sociology" "description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"group": "mmlu_social_sciences" "include": "_default_template_yaml" "task": "mmlu_sociology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml index fe8c68d8..f94e8bc0 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml @@ -1,4 +1,5 @@ "dataset_name": "us_foreign_policy" "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" +"group": "mmlu_social_sciences" "include": "_default_template_yaml" "task": "mmlu_us_foreign_policy" diff --git 
a/lm_eval/tasks/mmlu/default/mmlu_virology.yaml b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml index 4cbd0959..4fdc1bf6 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_virology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml @@ -1,4 +1,5 @@ "dataset_name": "virology" "description": "The following are multiple choice questions (with answers) about virology.\n\n" +"group": "mmlu_other" "include": "_default_template_yaml" "task": "mmlu_virology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml index 375efbae..870ea78d 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml @@ -1,4 +1,5 @@ "dataset_name": "world_religions" "description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"group": "mmlu_humanities" "include": "_default_template_yaml" "task": "mmlu_world_religions" -- GitLab From 1dc8f96f56522625098cf07139acb32f151503eb Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 16 Oct 2023 13:31:10 +0000 Subject: [PATCH 103/212] default to weighted averaging --- lm_eval/evaluator.py | 73 +++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index bf35097c..57f9d77c 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -219,6 +219,7 @@ def evaluate( padding_requests = collections.defaultdict(int) # store the hierarchy to do proper ordering task_hierarchy = collections.defaultdict(list) + group_hierarchy = collections.defaultdict(list) # store the ordering of tasks and groups task_order = collections.defaultdict(int) # store the aggregation for aggregating across tasks in the same group @@ -450,22 +451,26 @@ def evaluate( agg_fn = task.aggregation()[metric] task_score = agg_fn(items) - - if group_name is not None: - sample_metric_key = metric + "(sample agg)," + key - for grouping in task_to_group[task_name]: - if metric_key in results[grouping]: - results[grouping][metric_key].append(task_score) - else: - results[grouping][metric_key] = [task_score] - - if sample_metric_key in results[grouping]: - results[grouping][sample_metric_key] += items - else: - results[grouping][sample_metric_key] = items.copy() - sample_agg_fn[grouping][sample_metric_key] = agg_fn + task_size = len(items) + + # if group_name is not None: + # sample_metric_key = metric + "(sample agg)," + key + # for grouping in task_to_group[task_name]: + # if metric_key in results[grouping]: + # results[grouping][metric_key].append(task_score) + # results[grouping]["size"].append(task_size) + # else: + # results[grouping][metric_key] = [task_score] + # results[grouping]["size"] = [task_size] + + # if sample_metric_key in results[grouping]: + # results[grouping][sample_metric_key] += items + # else: + # results[grouping][sample_metric_key] = items.copy() + # sample_agg_fn[grouping][sample_metric_key] = agg_fn results[task_name][metric_key] = task_score + results[task_name]["size"] = task_size # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap # so we run them less iterations. 
still looking for a cleaner way to do this @@ -481,18 +486,36 @@ def evaluate( results[task_name][metric + "_stderr" + "," + key] = stderr(items) if bool(results): - for task_or_group in results.keys(): - for metric in results[task_or_group].keys(): - if type(results[task_or_group][metric]) == list: - if "(sample agg)" in metric: - results[task_or_group][metric] = sample_agg_fn[ - task_or_group - ][metric](results[task_or_group][metric]) + + for group, task_list in reversed(task_hierarchy.items()): + versions[group] = "N/A" + task_score_dict = {} + total_size = 0 + for task in task_list: + metrics = results[task] + + if "size" in metrics: + current_size = metrics.pop("size") + else: + current_size = 1 + + for metric in [key for key in metrics.keys()]: + + if "_stderr" in metric: + print(metric) + + metric_score = results[task][metric] + + if metric in results[group]: + results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size) else: - results[task_or_group][metric] = np.average( - results[task_or_group][metric] - ) - versions[task_or_group] = "N/A" + results[group][metric] = metric_score + + # Different formula for agg stderr + + + total_size += current_size + for task_name, task in task_dict.items(): if type(task) == tuple: -- GitLab From 00859825db00aee796be74292ea869978225f1e9 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 16 Oct 2023 14:26:35 +0000 Subject: [PATCH 104/212] added stderr reprocessing for groups --- lm_eval/evaluator.py | 49 ++++++------------- .../tasks/mmlu/default/_default_template_yaml | 3 -- 2 files changed, 15 insertions(+), 37 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 57f9d77c..5d923b1d 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -219,7 +219,6 @@ def evaluate( padding_requests = collections.defaultdict(int) # store the hierarchy to do proper ordering task_hierarchy = collections.defaultdict(list) - group_hierarchy = collections.defaultdict(list) # store the ordering of tasks and groups task_order = collections.defaultdict(int) # store the aggregation for aggregating across tasks in the same group @@ -450,27 +449,8 @@ def evaluate( group_name = None agg_fn = task.aggregation()[metric] - task_score = agg_fn(items) - task_size = len(items) - - # if group_name is not None: - # sample_metric_key = metric + "(sample agg)," + key - # for grouping in task_to_group[task_name]: - # if metric_key in results[grouping]: - # results[grouping][metric_key].append(task_score) - # results[grouping]["size"].append(task_size) - # else: - # results[grouping][metric_key] = [task_score] - # results[grouping]["size"] = [task_size] - - # if sample_metric_key in results[grouping]: - # results[grouping][sample_metric_key] += items - # else: - # results[grouping][sample_metric_key] = items.copy() - # sample_agg_fn[grouping][sample_metric_key] = agg_fn - - results[task_name][metric_key] = task_score - results[task_name]["size"] = task_size + results[task_name][metric_key] = agg_fn(items) + results[task_name]["samples"] = len(items) # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap # so we run them less iterations. 
still looking for a cleaner way to do this @@ -494,28 +474,29 @@ def evaluate( for task in task_list: metrics = results[task] - if "size" in metrics: - current_size = metrics.pop("size") - else: - current_size = 1 + current_size = metrics.pop("samples") + # if "size" in metrics: + # current_size = metrics.pop("size") + # else: + # current_size = 1 - for metric in [key for key in metrics.keys()]: - - if "_stderr" in metric: - print(metric) + for metric in [key for key in metrics.keys() if "_stderr" not in key]: + stderr = "_stderr,".join(metric.split(",")) + stderr_score = results[task][stderr] metric_score = results[task][metric] if metric in results[group]: results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size) + # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ + results[group][stderr] = ((total_size-1)*results[group][stderr]+(current_size-1)*stderr_score)/(total_size + current_size - 1) \ + + total_size*current_size/((total_size+current_size)*(total_size+current_size-1))*(results[group][metric] - metric_score)**2 else: results[group][metric] = metric_score - - # Different formula for agg stderr - + results[group][stderr] = stderr_score total_size += current_size - + results[group]["samples"] = total_size for task_name, task in task_dict.items(): if type(task) == tuple: diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index 7669f8ce..5eb1c069 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -11,6 +11,3 @@ metric_list: - metric: acc aggregation: mean higher_is_better: true - - metric: acc_norm - aggregation: mean - higher_is_better: true -- GitLab From e97019c05b1eaf49d71028adb73eefa2226de769 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 16 Oct 2023 15:24:56 +0000 Subject: [PATCH 105/212] removed comments --- lm_eval/evaluator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 5d923b1d..09ecb090 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -475,9 +475,7 @@ def evaluate( metrics = results[task] current_size = metrics.pop("samples") - # if "size" in metrics: - # current_size = metrics.pop("size") - # else: + # if (group in task_order) and task_order[group] == 0: # current_size = 1 for metric in [key for key in metrics.keys() if "_stderr" not in key]: -- GitLab From 11fdcb4944f791b13f45b6a6fd04c4ceef82ea8c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 16 Oct 2023 15:32:58 +0000 Subject: [PATCH 106/212] update --- lm_eval/tasks/squadv2/README.md | 37 ++++++++++++++++++++++-------- lm_eval/tasks/squadv2/default.yaml | 14 +---------- lm_eval/tasks/squadv2/no_ans.yaml | 5 +--- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/lm_eval/tasks/squadv2/README.md b/lm_eval/tasks/squadv2/README.md index c13bd21d..f29ad171 100644 --- a/lm_eval/tasks/squadv2/README.md +++ b/lm_eval/tasks/squadv2/README.md @@ -2,25 +2,44 @@ ### Paper -Title: `paper title goes here` -Abstract: `link to paper PDF or arXiv abstract goes here` +Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD` +Abstract: https://arxiv.org/abs/1806.03822 -`Short description of paper / benchmark goes here:` +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, +consisting of questions posed by crowdworkers on a set of Wikipedia 
articles, +where the answer to every question is a segment of text, or span, from the +corresponding reading passage, or the question might be unanswerable. +SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable +questions written adversarially by crowdworkers to look similar to answerable ones. +To do well on SQuAD2.0, systems must not only answer questions when possible, but +also determine when no answer is supported by the paragraph and abstain from answering. -Homepage: `homepage to the benchmark's website goes here, if applicable` +Homepage: https://rajpurkar.github.io/SQuAD-explorer/ ### Citation ``` -BibTeX-formatted citation goes here +@misc{rajpurkar2018know, + title={Know What You Don't Know: Unanswerable Questions for SQuAD}, + author={Pranav Rajpurkar and Robin Jia and Percy Liang}, + year={2018}, + eprint={1806.03822}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} ``` -### Subtasks +### Groups and Tasks -List or describe tasks defined in this folder, and their names here: -* `task_name`: `1-sentence description of what this particular task does` -* `task_name2`: ..... +#### Groups + +* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood` + +#### Tasks + +* `squadv2`: `Default squadv2 task` +* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of model predicting there is no answer` ### Checklist diff --git a/lm_eval/tasks/squadv2/default.yaml b/lm_eval/tasks/squadv2/default.yaml index 2bb3029f..51a304fc 100644 --- a/lm_eval/tasks/squadv2/default.yaml +++ b/lm_eval/tasks/squadv2/default.yaml @@ -1,21 +1,9 @@ +include: _template_yaml task: squadv2 -dataset_path: squad_v2 output_type: greedy_until -training_split: train -validation_split: validation -doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:" -doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}" -target_delimiter: "" -should_decontaminate: true -doc_to_decontamination_query: context generation_kwargs: until: - "\n" -# filter_list: -# - name: remove_whitespace -# filter: -# - function: remove_whitespace -# - function: take_first metric_list: - metric: !function utils.exact aggregation: mean diff --git a/lm_eval/tasks/squadv2/no_ans.yaml b/lm_eval/tasks/squadv2/no_ans.yaml index 82d7c477..7b0a47c7 100644 --- a/lm_eval/tasks/squadv2/no_ans.yaml +++ b/lm_eval/tasks/squadv2/no_ans.yaml @@ -1,9 +1,6 @@ -include: default.yaml +include: _template_yaml task: squadv2_noans_loglikelihood -dataset_path: squad_v2 output_type: loglikelihood -training_split: train -validation_split: validation doc_to_target: " unanswerable" metric_list: - metric: perplexity -- GitLab From 04ca56711786951549aaa29e7c5036ff029ca7f6 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Mon, 16 Oct 2023 12:09:38 -0400 Subject: [PATCH 107/212] Update README.md --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9c6a054f..22afe46e 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,12 @@ Features: - Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md). 
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface. - Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/). -- Support for evaluation on adapters (e.g. LoRa) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft). -- Evaluating with publicly available prompts ensures reproducibility and comparability between papers. +- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft). +- Support for local models and benchmarks. +- Evaluation with publicly available prompts ensures reproducibility and comparability between papers. + +The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and is used internally by dozens of companies including NVIDIA, Cohere, Booz Allen Hamilton, and Mosaic ML. + ## Install -- GitLab From 2ade969312b81d1bac5070793ecc6816c0b02604 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 17 Oct 2023 03:48:49 +0000 Subject: [PATCH 108/212] template yaml --- lm_eval/tasks/squadv2/_template.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 lm_eval/tasks/squadv2/_template.yaml diff --git a/lm_eval/tasks/squadv2/_template.yaml b/lm_eval/tasks/squadv2/_template.yaml new file mode 100644 index 00000000..05bb724a --- /dev/null +++ b/lm_eval/tasks/squadv2/_template.yaml @@ -0,0 +1,8 @@ +dataset_path: squad_v2 +training_split: train +validation_split: validation +doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:" +doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}" +target_delimiter: "" +should_decontaminate: true +doc_to_decontamination_query: context -- GitLab From 4ccd2ec68c4d206488c543b7ed945a57ac0531a5 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 17 Oct 2023 03:59:56 +0000 Subject: [PATCH 109/212] sqrt final variance calculation to get stderr --- lm_eval/evaluator.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 09ecb090..fb7ae564 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -478,12 +478,15 @@ def evaluate( # if (group in task_order) and task_order[group] == 0: # current_size = 1 + all_stderr = [] for metric in [key for key in metrics.keys() if "_stderr" not in key]: stderr = "_stderr,".join(metric.split(",")) stderr_score = results[task][stderr] metric_score = results[task][metric] + all_stderr.append(stderr) + if metric in results[group]: results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size) # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ @@ -494,6 +497,10 @@ def evaluate( results[group][stderr] = stderr_score total_size += current_size + + for stderr in all_stderr: + results[group][stderr] = np.sqrt(results[group][stderr]) + results[group]["samples"] = total_size for task_name, task in task_dict.items(): -- GitLab From c4f0bf75a9c1dcf42f3564ca04cc0626ea850aec Mon Sep 17 00:00:00 
2001 From: lintangsutawika Date: Tue, 17 Oct 2023 04:03:25 +0000 Subject: [PATCH 110/212] pre-commit reformat --- lm_eval/evaluator.py | 27 +++++++++++++++++-------- lm_eval/tasks/mmlu/_generate_configs.py | 6 +++--- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index fb7ae564..eb263fd8 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -221,8 +221,6 @@ def evaluate( task_hierarchy = collections.defaultdict(list) # store the ordering of tasks and groups task_order = collections.defaultdict(int) - # store the aggregation for aggregating across tasks in the same group - sample_agg_fn = collections.defaultdict(dict) # get lists of each type of request for task_name, task in task_dict.items(): @@ -469,7 +467,6 @@ def evaluate( for group, task_list in reversed(task_hierarchy.items()): versions[group] = "N/A" - task_score_dict = {} total_size = 0 for task in task_list: metrics = results[task] @@ -479,7 +476,9 @@ def evaluate( # current_size = 1 all_stderr = [] - for metric in [key for key in metrics.keys() if "_stderr" not in key]: + for metric in [ + key for key in metrics.keys() if "_stderr" not in key + ]: stderr = "_stderr,".join(metric.split(",")) stderr_score = results[task][stderr] @@ -488,10 +487,22 @@ def evaluate( all_stderr.append(stderr) if metric in results[group]: - results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size) + results[group][metric] = ( + results[group][metric] * total_size + + metric_score * current_size + ) / (total_size + current_size) # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ - results[group][stderr] = ((total_size-1)*results[group][stderr]+(current_size-1)*stderr_score)/(total_size + current_size - 1) \ - + total_size*current_size/((total_size+current_size)*(total_size+current_size-1))*(results[group][metric] - metric_score)**2 + results[group][stderr] = ( + (total_size - 1) * results[group][stderr] + + (current_size - 1) * stderr_score + ) / ( + total_size + current_size - 1 + ) + total_size * current_size / ( + (total_size + current_size) + * (total_size + current_size - 1) + ) * ( + results[group][metric] - metric_score + ) ** 2 else: results[group][metric] = metric_score results[group][stderr] = stderr_score @@ -500,7 +511,7 @@ def evaluate( for stderr in all_stderr: results[group][stderr] = np.sqrt(results[group][stderr]) - + results[group]["samples"] = total_size for task_name, task in task_dict.items(): diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index b34a39ea..ec1366aa 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -137,8 +137,8 @@ if __name__ == "__main__": yaml.dump( { "group": f"mmlu_{args.group_prefix}", - "task": [f"mmlu_{category}" for category in ALL_CATEGORIES] - }, + "task": [f"mmlu_{category}" for category in ALL_CATEGORIES], + }, yaml_file, - default_flow_style=False + default_flow_style=False, ) -- GitLab From 93a45962f73c8cb9277855d399afd6d90d0b0019 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 17 Oct 2023 10:47:01 +0000 Subject: [PATCH 111/212] print tasks in alphabetically --- lm_eval/evaluator.py | 137 ++++++++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 55 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index eb263fd8..caf84941 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py 
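The running update in the evaluator patches above folds one task at a time into its parent group: the mean is a size-weighted average, and the spread is combined with the pairwise pooling rule quoted in the code comment, with the square root applied once at the end (patch 109). A minimal self-contained sketch of that rule, using illustrative names in place of the `results[group]` bookkeeping:

```python
# Pairwise pooling of (size, mean, variance) statistics for two subgroups.
# Names n, m, mean_x, var_x, ... are illustrative; the evaluator keeps the
# equivalent state in results[group][metric] and results[group][stderr].
def pool(n, mean_x, var_x, m, mean_y, var_y):
    mean_z = (n * mean_x + m * mean_y) / (n + m)
    var_z = ((n - 1) * var_x + (m - 1) * var_y) / (n + m - 1) + (
        n * m * (mean_x - mean_y) ** 2
    ) / ((n + m) * (n + m - 1))
    return n + m, mean_z, var_z


# Fold three tasks into one group, left to right, then take sqrt once.
size, mean, var = 100, 0.42, 0.010
for task_stats in [(50, 0.55, 0.020), (200, 0.38, 0.015)]:
    size, mean, var = pool(size, mean, var, *task_stats)
print(size, mean, var ** 0.5)  # group size, weighted mean, pooled stderr
```

Treating each task's contribution as size 1 instead, as the `current_size = 1` comment suggests, would recover an unweighted macro-average over tasks.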
@@ -227,6 +227,7 @@ def evaluate( if type(task) == tuple: group_name, task = task task_hierarchy[group_name].append(task_name) + versions[group_name] = "N/A" else: task_hierarchy[task_name] = [] @@ -466,68 +467,94 @@ def evaluate( if bool(results): for group, task_list in reversed(task_hierarchy.items()): - versions[group] = "N/A" - total_size = 0 - for task in task_list: - metrics = results[task] - - current_size = metrics.pop("samples") - # if (group in task_order) and task_order[group] == 0: - # current_size = 1 - - all_stderr = [] - for metric in [ - key for key in metrics.keys() if "_stderr" not in key - ]: - - stderr = "_stderr,".join(metric.split(",")) - stderr_score = results[task][stderr] - metric_score = results[task][metric] - - all_stderr.append(stderr) - - if metric in results[group]: - results[group][metric] = ( - results[group][metric] * total_size - + metric_score * current_size - ) / (total_size + current_size) - # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ - results[group][stderr] = ( - (total_size - 1) * results[group][stderr] - + (current_size - 1) * stderr_score - ) / ( - total_size + current_size - 1 - ) + total_size * current_size / ( - (total_size + current_size) - * (total_size + current_size - 1) - ) * ( - results[group][metric] - metric_score - ) ** 2 - else: - results[group][metric] = metric_score - results[group][stderr] = stderr_score - - total_size += current_size - for stderr in all_stderr: - results[group][stderr] = np.sqrt(results[group][stderr]) + if task_list == []: + total_size = results[group]["samples"] + else: + total_size = 0 + + for task in task_list: + metrics = results[task] + + current_size = metrics.pop("samples") + # TODO: There should be a way for users + # to toggle between weighted and + # unweighted averaging + # For unweighted averaging, use: + # current_size = 1 + + all_stderr = [] + for metric in [ + key for key in metrics.keys() if "_stderr" not in key + ]: + + stderr = "_stderr,".join(metric.split(",")) + stderr_score = results[task][stderr] + metric_score = results[task][metric] + + all_stderr.append(stderr) + + if metric in results[group]: + results[group][metric] = ( + results[group][metric] * total_size + + metric_score * current_size + ) / (total_size + current_size) + # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ + results[group][stderr] = ( + (total_size - 1) * results[group][stderr] + + (current_size - 1) * stderr_score + ) / ( + total_size + current_size - 1 + ) + total_size * current_size / ( + (total_size + current_size) + * (total_size + current_size - 1) + ) * ( + results[group][metric] - metric_score + ) ** 2 + else: + results[group][metric] = metric_score + results[group][stderr] = stderr_score + + total_size += current_size + + for stderr in all_stderr: + results[group][stderr] = np.sqrt(results[group][stderr]) results[group]["samples"] = total_size - for task_name, task in task_dict.items(): - if type(task) == tuple: - group_name, task = task + def print_tasks(task_hierarchy, task_order, task_version): + + results_agg = collections.defaultdict(dict) + groups_agg = collections.defaultdict(dict) + for group_name, task_list in task_hierarchy.items(): + order = task_order[group_name] tabbed_name = "-" * order + group_name results_agg[tabbed_name] = results[group_name] - versions[tabbed_name] = versions[group_name] - if order == 0: - groups_agg[group_name] = results[group_name] - - order = task_order[task_name] - 
tabbed_name = "-" * order + task_name - results_agg[tabbed_name] = results[task_name] - versions[tabbed_name] = versions[task_name] + task_version[tabbed_name] = task_version[group_name] + + if (order < max(task_order.values())) and (len(task_list) > 0): + groups_agg[tabbed_name] = results[group_name] + + if task_list != []: + for task in sorted(task_list): + if task in task_hierarchy: + _task_hierarchy = {task: task_hierarchy[task]} + else: + _task_hierarchy = {task: []} + + _results_agg, _groups_agg, task_version = print_tasks( + _task_hierarchy, task_order, task_version + ) + + results_agg = {**results_agg, **_results_agg} + groups_agg = {**groups_agg, **_groups_agg} + + return results_agg, groups_agg, task_version + + results_agg, groups_agg, versions = print_tasks( + task_hierarchy, task_order, versions + ) results_dict = { "results": dict(results_agg.items()), -- GitLab From a2c4139b537cdcdac48394b5b5270ad1bdc05d51 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 17 Oct 2023 10:47:52 +0000 Subject: [PATCH 112/212] precommit stuff --- lm_eval/tasks/belebele/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/belebele/README.md b/lm_eval/tasks/belebele/README.md index 7b6ab809..e08e63e8 100644 --- a/lm_eval/tasks/belebele/README.md +++ b/lm_eval/tasks/belebele/README.md @@ -13,7 +13,7 @@ Homepage: https://github.com/facebookresearch/belebele ```bibtex @misc{bandarkar2023belebele, - title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants}, + title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants}, author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa}, year={2023}, eprint={2308.16884}, -- GitLab From 109ed1c7c0dc3f29e2ea206dba642e595f839ab6 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 17 Oct 2023 14:17:23 +0000 Subject: [PATCH 113/212] added subgroups for other mmlu variants --- lm_eval/tasks/mmlu/_generate_configs.py | 29 +- .../mmlu/default/{mmlu.yaml => _mmlu.yaml} | 0 .../{ => flan_cot_fewshot}/_cot_prompts.json | 0 .../tasks/mmlu/flan_cot_fewshot/_mmlu.yaml | 6 + .../mmlu_abstract_algebra.yaml | 9 +- .../mmlu/flan_cot_fewshot/mmlu_anatomy.yaml | 82 +++--- .../mmlu/flan_cot_fewshot/mmlu_astronomy.yaml | 77 ++--- .../mmlu_business_ethics.yaml | 51 ++-- .../mmlu_clinical_knowledge.yaml | 93 +++--- .../mmlu_college_biology.yaml | 11 +- .../mmlu_college_chemistry.yaml | 75 ++--- .../mmlu_college_computer_science.yaml | 268 ++++++------------ .../mmlu_college_mathematics.yaml | 91 +++--- .../mmlu_college_medicine.yaml | 86 +++--- .../mmlu_college_physics.yaml | 114 +++----- .../mmlu_computer_security.yaml | 63 ++-- .../mmlu_conceptual_physics.yaml | 65 ++--- .../flan_cot_fewshot/mmlu_econometrics.yaml | 114 ++++---- .../mmlu_electrical_engineering.yaml | 64 ++--- .../mmlu_elementary_mathematics.yaml | 37 +-- .../flan_cot_fewshot/mmlu_formal_logic.yaml | 106 ++++--- .../flan_cot_fewshot/mmlu_global_facts.yaml | 65 ++--- .../mmlu_high_school_biology.yaml | 94 +++--- .../mmlu_high_school_chemistry.yaml | 9 +- .../mmlu_high_school_computer_science.yaml | 15 +- .../mmlu_high_school_european_history.yaml | 45 +-- .../mmlu_high_school_geography.yaml | 100 +++---- ...u_high_school_government_and_politics.yaml | 110 +++---- .../mmlu_high_school_macroeconomics.yaml | 101 +++---- .../mmlu_high_school_mathematics.yaml 
| 11 +- .../mmlu_high_school_microeconomics.yaml | 103 +++---- .../mmlu_high_school_physics.yaml | 65 ++--- .../mmlu_high_school_psychology.yaml | 120 ++++---- .../mmlu_high_school_statistics.yaml | 146 ++++------ .../mmlu_high_school_us_history.yaml | 251 ++++++++-------- .../mmlu_high_school_world_history.yaml | 148 +++++----- .../flan_cot_fewshot/mmlu_human_aging.yaml | 76 ++--- .../mmlu_human_sexuality.yaml | 95 +++---- .../mmlu_international_law.yaml | 134 ++++----- .../flan_cot_fewshot/mmlu_jurisprudence.yaml | 114 +++----- .../mmlu_logical_fallacies.yaml | 116 +++----- .../mmlu_machine_learning.yaml | 91 +++--- .../flan_cot_fewshot/mmlu_management.yaml | 87 +++--- .../mmlu/flan_cot_fewshot/mmlu_marketing.yaml | 106 +++---- .../mmlu_medical_genetics.yaml | 98 +++---- .../flan_cot_fewshot/mmlu_miscellaneous.yaml | 81 ++---- .../flan_cot_fewshot/mmlu_moral_disputes.yaml | 121 ++++---- .../mmlu_moral_scenarios.yaml | 110 +++---- .../mmlu/flan_cot_fewshot/mmlu_nutrition.yaml | 120 ++++---- .../flan_cot_fewshot/mmlu_philosophy.yaml | 50 ++-- .../flan_cot_fewshot/mmlu_prehistory.yaml | 109 +++---- .../mmlu_professional_accounting.yaml | 13 +- .../mmlu_professional_law.yaml | 9 +- .../mmlu_professional_medicine.yaml | 35 +-- .../mmlu_professional_psychology.yaml | 59 ++-- .../mmlu_public_relations.yaml | 104 +++---- .../mmlu_security_studies.yaml | 9 +- .../mmlu/flan_cot_fewshot/mmlu_sociology.yaml | 110 +++---- .../mmlu_us_foreign_policy.yaml | 106 +++---- .../mmlu/flan_cot_fewshot/mmlu_virology.yaml | 86 ++---- .../mmlu_world_religions.yaml | 80 ++---- .../tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml | 6 + ... => _mmlu_flan_cot_zeroshot_template_yaml} | 0 .../mmlu_abstract_algebra.yaml | 14 +- .../mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml | 13 +- .../flan_cot_zeroshot/mmlu_astronomy.yaml | 13 +- .../mmlu_business_ethics.yaml | 14 +- .../mmlu_clinical_knowledge.yaml | 14 +- .../mmlu_college_biology.yaml | 14 +- .../mmlu_college_chemistry.yaml | 14 +- .../mmlu_college_computer_science.yaml | 14 +- .../mmlu_college_mathematics.yaml | 14 +- .../mmlu_college_medicine.yaml | 14 +- .../mmlu_college_physics.yaml | 14 +- .../mmlu_computer_security.yaml | 14 +- .../mmlu_conceptual_physics.yaml | 14 +- .../flan_cot_zeroshot/mmlu_econometrics.yaml | 13 +- .../mmlu_electrical_engineering.yaml | 14 +- .../mmlu_elementary_mathematics.yaml | 14 +- .../flan_cot_zeroshot/mmlu_formal_logic.yaml | 14 +- .../flan_cot_zeroshot/mmlu_global_facts.yaml | 14 +- .../mmlu_high_school_biology.yaml | 14 +- .../mmlu_high_school_chemistry.yaml | 14 +- .../mmlu_high_school_computer_science.yaml | 14 +- .../mmlu_high_school_european_history.yaml | 14 +- .../mmlu_high_school_geography.yaml | 14 +- ...u_high_school_government_and_politics.yaml | 14 +- .../mmlu_high_school_macroeconomics.yaml | 14 +- .../mmlu_high_school_mathematics.yaml | 14 +- .../mmlu_high_school_microeconomics.yaml | 14 +- .../mmlu_high_school_physics.yaml | 14 +- .../mmlu_high_school_psychology.yaml | 14 +- .../mmlu_high_school_statistics.yaml | 14 +- .../mmlu_high_school_us_history.yaml | 14 +- .../mmlu_high_school_world_history.yaml | 14 +- .../flan_cot_zeroshot/mmlu_human_aging.yaml | 14 +- .../mmlu_human_sexuality.yaml | 14 +- .../mmlu_international_law.yaml | 14 +- .../flan_cot_zeroshot/mmlu_jurisprudence.yaml | 13 +- .../mmlu_logical_fallacies.yaml | 14 +- .../mmlu_machine_learning.yaml | 14 +- .../flan_cot_zeroshot/mmlu_management.yaml | 13 +- .../flan_cot_zeroshot/mmlu_marketing.yaml | 13 +- .../mmlu_medical_genetics.yaml | 14 +- 
.../flan_cot_zeroshot/mmlu_miscellaneous.yaml | 13 +- .../mmlu_moral_disputes.yaml | 14 +- .../mmlu_moral_scenarios.yaml | 14 +- .../flan_cot_zeroshot/mmlu_nutrition.yaml | 13 +- .../flan_cot_zeroshot/mmlu_philosophy.yaml | 13 +- .../flan_cot_zeroshot/mmlu_prehistory.yaml | 13 +- .../mmlu_professional_accounting.yaml | 14 +- .../mmlu_professional_law.yaml | 14 +- .../mmlu_professional_medicine.yaml | 14 +- .../mmlu_professional_psychology.yaml | 14 +- .../mmlu_public_relations.yaml | 14 +- .../mmlu_security_studies.yaml | 14 +- .../flan_cot_zeroshot/mmlu_sociology.yaml | 13 +- .../mmlu_us_foreign_policy.yaml | 14 +- .../mmlu/flan_cot_zeroshot/mmlu_virology.yaml | 13 +- .../mmlu_world_religions.yaml | 14 +- .../mmlu/flan_n_shot/generative/_mmlu.yaml | 6 + .../_mmlu_flan_generative_template_yaml | 0 .../mmlu_abstract_algebra.yaml} | 4 +- .../mmlu_anatomy.yaml} | 4 +- .../mmlu_astronomy.yaml} | 4 +- .../mmlu_business_ethics.yaml} | 4 +- .../mmlu_clinical_knowledge.yaml} | 4 +- .../mmlu_college_biology.yaml} | 4 +- .../mmlu_college_chemistry.yaml} | 4 +- .../mmlu_college_computer_science.yaml} | 4 +- .../mmlu_college_mathematics.yaml} | 4 +- .../mmlu_college_medicine.yaml} | 4 +- .../mmlu_college_physics.yaml} | 4 +- .../mmlu_computer_security.yaml} | 4 +- .../mmlu_conceptual_physics.yaml} | 4 +- .../mmlu_econometrics.yaml} | 4 +- .../mmlu_electrical_engineering.yaml} | 4 +- .../mmlu_elementary_mathematics.yaml} | 4 +- .../mmlu_formal_logic.yaml} | 4 +- .../mmlu_global_facts.yaml} | 4 +- .../mmlu_high_school_biology.yaml} | 4 +- .../mmlu_high_school_chemistry.yaml} | 4 +- .../mmlu_high_school_computer_science.yaml} | 4 +- .../mmlu_high_school_european_history.yaml} | 4 +- .../mmlu_high_school_geography.yaml} | 4 +- ..._high_school_government_and_politics.yaml} | 4 +- .../mmlu_high_school_macroeconomics.yaml} | 4 +- .../mmlu_high_school_mathematics.yaml} | 4 +- .../mmlu_high_school_microeconomics.yaml} | 4 +- .../mmlu_high_school_physics.yaml} | 4 +- .../mmlu_high_school_psychology.yaml} | 4 +- .../mmlu_high_school_statistics.yaml} | 4 +- .../mmlu_high_school_us_history.yaml} | 4 +- .../mmlu_high_school_world_history.yaml} | 4 +- .../mmlu_human_aging.yaml} | 4 +- .../mmlu_human_sexuality.yaml} | 4 +- .../mmlu_international_law.yaml} | 4 +- .../mmlu_jurisprudence.yaml} | 4 +- .../mmlu_logical_fallacies.yaml} | 4 +- .../mmlu_machine_learning.yaml} | 4 +- .../mmlu_management.yaml} | 4 +- .../mmlu_marketing.yaml} | 4 +- .../mmlu_medical_genetics.yaml} | 4 +- .../mmlu_miscellaneous.yaml} | 4 +- .../mmlu_moral_disputes.yaml} | 4 +- .../mmlu_moral_scenarios.yaml} | 4 +- .../mmlu_nutrition.yaml} | 4 +- .../mmlu_philosophy.yaml} | 4 +- .../mmlu_prehistory.yaml} | 4 +- .../mmlu_professional_accounting.yaml} | 4 +- .../mmlu_professional_law.yaml} | 4 +- .../mmlu_professional_medicine.yaml} | 4 +- .../mmlu_professional_psychology.yaml} | 4 +- .../mmlu_public_relations.yaml} | 4 +- .../mmlu_security_studies.yaml} | 4 +- .../mmlu_sociology.yaml} | 4 +- .../mmlu_us_foreign_policy.yaml} | 4 +- .../mmlu_virology.yaml} | 4 +- .../mmlu_world_religions.yaml} | 4 +- .../mmlu/flan_n_shot/loglikelihood/_mmlu.yaml | 6 + .../_mmlu_flan_loglikelihood_template_yaml | 2 +- .../mmlu_abstract_algebra.yaml} | 6 +- .../mmlu_anatomy.yaml} | 6 +- .../mmlu_astronomy.yaml} | 6 +- .../mmlu_business_ethics.yaml} | 6 +- .../mmlu_clinical_knowledge.yaml} | 6 +- .../mmlu_college_biology.yaml} | 6 +- .../mmlu_college_chemistry.yaml} | 6 +- .../mmlu_college_computer_science.yaml | 6 + .../mmlu_college_mathematics.yaml} | 6 +- 
.../mmlu_college_medicine.yaml} | 6 +- .../mmlu_college_physics.yaml} | 6 +- .../mmlu_computer_security.yaml} | 6 +- .../mmlu_conceptual_physics.yaml} | 6 +- .../mmlu_econometrics.yaml} | 6 +- .../mmlu_electrical_engineering.yaml} | 6 +- .../mmlu_elementary_mathematics.yaml} | 6 +- .../mmlu_formal_logic.yaml} | 6 +- .../mmlu_global_facts.yaml} | 6 +- .../mmlu_high_school_biology.yaml} | 6 +- .../mmlu_high_school_chemistry.yaml} | 6 +- .../mmlu_high_school_computer_science.yaml | 6 + .../mmlu_high_school_european_history.yaml | 6 + .../mmlu_high_school_geography.yaml | 6 + ...u_high_school_government_and_politics.yaml | 6 + .../mmlu_high_school_macroeconomics.yaml | 6 + .../mmlu_high_school_mathematics.yaml} | 6 +- .../mmlu_high_school_microeconomics.yaml | 6 + .../mmlu_high_school_physics.yaml} | 6 +- .../mmlu_high_school_psychology.yaml | 6 + .../mmlu_high_school_statistics.yaml} | 6 +- .../mmlu_high_school_us_history.yaml | 6 + .../mmlu_high_school_world_history.yaml | 6 + .../mmlu_human_aging.yaml} | 6 +- .../loglikelihood/mmlu_human_sexuality.yaml | 6 + .../mmlu_international_law.yaml} | 6 +- .../mmlu_jurisprudence.yaml} | 6 +- .../mmlu_logical_fallacies.yaml} | 6 +- .../mmlu_machine_learning.yaml} | 6 +- .../mmlu_management.yaml} | 6 +- .../mmlu_marketing.yaml} | 6 +- .../mmlu_medical_genetics.yaml} | 6 +- .../mmlu_miscellaneous.yaml} | 6 +- .../mmlu_moral_disputes.yaml} | 6 +- .../mmlu_moral_scenarios.yaml} | 6 +- .../mmlu_nutrition.yaml} | 6 +- .../mmlu_philosophy.yaml} | 6 +- .../mmlu_prehistory.yaml} | 6 +- .../mmlu_professional_accounting.yaml | 6 + .../mmlu_professional_law.yaml} | 6 +- .../mmlu_professional_medicine.yaml} | 6 +- .../mmlu_professional_psychology.yaml | 6 + .../loglikelihood/mmlu_public_relations.yaml | 6 + .../loglikelihood/mmlu_security_studies.yaml | 6 + .../mmlu_sociology.yaml} | 6 +- .../loglikelihood/mmlu_us_foreign_policy.yaml | 6 + .../mmlu_virology.yaml} | 6 +- .../mmlu_world_religions.yaml} | 6 +- .../mmlu_log_college_computer_science.yaml | 4 - ...mmlu_log_high_school_computer_science.yaml | 4 - ...mmlu_log_high_school_european_history.yaml | 4 - .../mmlu_log_high_school_geography.yaml | 4 - ...g_high_school_government_and_politics.yaml | 4 - .../mmlu_log_high_school_macroeconomics.yaml | 4 - .../mmlu_log_high_school_microeconomics.yaml | 4 - .../mmlu_log_high_school_psychology.yaml | 4 - .../mmlu_log_high_school_us_history.yaml | 4 - .../mmlu_log_high_school_world_history.yaml | 4 - .../flan_n_shot/mmlu_log_human_sexuality.yaml | 4 - .../mmlu_log_professional_accounting.yaml | 4 - .../mmlu_log_professional_psychology.yaml | 4 - .../mmlu_log_public_relations.yaml | 4 - .../mmlu_log_security_studies.yaml | 4 - .../mmlu_log_us_foreign_policy.yaml | 4 - 254 files changed, 2905 insertions(+), 3478 deletions(-) rename lm_eval/tasks/mmlu/default/{mmlu.yaml => _mmlu.yaml} (100%) rename lm_eval/tasks/mmlu/{ => flan_cot_fewshot}/_cot_prompts.json (100%) create mode 100644 lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml create mode 100644 lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml rename lm_eval/tasks/mmlu/flan_cot_zeroshot/{_mmlu_flan_generative_template_yaml => _mmlu_flan_cot_zeroshot_template_yaml} (100%) create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml rename lm_eval/tasks/mmlu/flan_n_shot/{ => generative}/_mmlu_flan_generative_template_yaml (100%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_abstract_algebra.yaml => generative/mmlu_abstract_algebra.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_anatomy.yaml => 
generative/mmlu_anatomy.yaml} (70%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_astronomy.yaml => generative/mmlu_astronomy.yaml} (70%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_business_ethics.yaml => generative/mmlu_business_ethics.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_clinical_knowledge.yaml => generative/mmlu_clinical_knowledge.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_college_biology.yaml => generative/mmlu_college_biology.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_college_chemistry.yaml => generative/mmlu_college_chemistry.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_college_computer_science.yaml => generative/mmlu_college_computer_science.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_college_mathematics.yaml => generative/mmlu_college_mathematics.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_college_medicine.yaml => generative/mmlu_college_medicine.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_college_physics.yaml => generative/mmlu_college_physics.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_computer_security.yaml => generative/mmlu_computer_security.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_conceptual_physics.yaml => generative/mmlu_conceptual_physics.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_econometrics.yaml => generative/mmlu_econometrics.yaml} (67%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_electrical_engineering.yaml => generative/mmlu_electrical_engineering.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_elementary_mathematics.yaml => generative/mmlu_elementary_mathematics.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_formal_logic.yaml => generative/mmlu_formal_logic.yaml} (68%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_global_facts.yaml => generative/mmlu_global_facts.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_biology.yaml => generative/mmlu_high_school_biology.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_chemistry.yaml => generative/mmlu_high_school_chemistry.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_computer_science.yaml => generative/mmlu_high_school_computer_science.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_european_history.yaml => generative/mmlu_high_school_european_history.yaml} (67%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_geography.yaml => generative/mmlu_high_school_geography.yaml} (66%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_government_and_politics.yaml => generative/mmlu_high_school_government_and_politics.yaml} (66%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_macroeconomics.yaml => generative/mmlu_high_school_macroeconomics.yaml} (66%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_mathematics.yaml => generative/mmlu_high_school_mathematics.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_microeconomics.yaml => generative/mmlu_high_school_microeconomics.yaml} (66%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_physics.yaml => generative/mmlu_high_school_physics.yaml} (69%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_psychology.yaml => generative/mmlu_high_school_psychology.yaml} (66%) rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_statistics.yaml => generative/mmlu_high_school_statistics.yaml} (69%) 
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_us_history.yaml => generative/mmlu_high_school_us_history.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_high_school_world_history.yaml => generative/mmlu_high_school_world_history.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_human_aging.yaml => generative/mmlu_human_aging.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_human_sexuality.yaml => generative/mmlu_human_sexuality.yaml} (67%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_international_law.yaml => generative/mmlu_international_law.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_jurisprudence.yaml => generative/mmlu_jurisprudence.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_logical_fallacies.yaml => generative/mmlu_logical_fallacies.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_machine_learning.yaml => generative/mmlu_machine_learning.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_management.yaml => generative/mmlu_management.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_marketing.yaml => generative/mmlu_marketing.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_medical_genetics.yaml => generative/mmlu_medical_genetics.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_miscellaneous.yaml => generative/mmlu_miscellaneous.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_moral_disputes.yaml => generative/mmlu_moral_disputes.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_moral_scenarios.yaml => generative/mmlu_moral_scenarios.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_nutrition.yaml => generative/mmlu_nutrition.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_philosophy.yaml => generative/mmlu_philosophy.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_prehistory.yaml => generative/mmlu_prehistory.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_professional_accounting.yaml => generative/mmlu_professional_accounting.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_professional_law.yaml => generative/mmlu_professional_law.yaml} (68%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_professional_medicine.yaml => generative/mmlu_professional_medicine.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_professional_psychology.yaml => generative/mmlu_professional_psychology.yaml} (66%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_public_relations.yaml => generative/mmlu_public_relations.yaml} (66%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_security_studies.yaml => generative/mmlu_security_studies.yaml} (66%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_sociology.yaml => generative/mmlu_sociology.yaml} (67%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_us_foreign_policy.yaml => generative/mmlu_us_foreign_policy.yaml} (66%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_virology.yaml => generative/mmlu_virology.yaml} (69%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_gen_world_religions.yaml => generative/mmlu_world_religions.yaml} (68%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{ => loglikelihood}/_mmlu_flan_loglikelihood_template_yaml (94%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_abstract_algebra.yaml => loglikelihood/mmlu_abstract_algebra.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_anatomy.yaml => loglikelihood/mmlu_anatomy.yaml} (53%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_astronomy.yaml => loglikelihood/mmlu_astronomy.yaml} (52%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_business_ethics.yaml => loglikelihood/mmlu_business_ethics.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_clinical_knowledge.yaml => loglikelihood/mmlu_clinical_knowledge.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_college_biology.yaml => loglikelihood/mmlu_college_biology.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_college_chemistry.yaml => loglikelihood/mmlu_college_chemistry.yaml} (51%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_college_mathematics.yaml => loglikelihood/mmlu_college_mathematics.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_college_medicine.yaml => loglikelihood/mmlu_college_medicine.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_college_physics.yaml => loglikelihood/mmlu_college_physics.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_computer_security.yaml => loglikelihood/mmlu_computer_security.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_conceptual_physics.yaml => loglikelihood/mmlu_conceptual_physics.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_econometrics.yaml => loglikelihood/mmlu_econometrics.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_electrical_engineering.yaml => loglikelihood/mmlu_electrical_engineering.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_elementary_mathematics.yaml => loglikelihood/mmlu_elementary_mathematics.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_formal_logic.yaml => loglikelihood/mmlu_formal_logic.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_global_facts.yaml => loglikelihood/mmlu_global_facts.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_high_school_biology.yaml => loglikelihood/mmlu_high_school_biology.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_high_school_chemistry.yaml => loglikelihood/mmlu_high_school_chemistry.yaml} (50%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_high_school_mathematics.yaml => loglikelihood/mmlu_high_school_mathematics.yaml} (50%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_high_school_physics.yaml => loglikelihood/mmlu_high_school_physics.yaml} (50%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_high_school_statistics.yaml => loglikelihood/mmlu_high_school_statistics.yaml} (50%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_human_aging.yaml => loglikelihood/mmlu_human_aging.yaml} (51%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_international_law.yaml => loglikelihood/mmlu_international_law.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_jurisprudence.yaml => loglikelihood/mmlu_jurisprudence.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_logical_fallacies.yaml => loglikelihood/mmlu_logical_fallacies.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_machine_learning.yaml => loglikelihood/mmlu_machine_learning.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_management.yaml => loglikelihood/mmlu_management.yaml} (52%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_marketing.yaml => loglikelihood/mmlu_marketing.yaml} (52%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_medical_genetics.yaml => loglikelihood/mmlu_medical_genetics.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_miscellaneous.yaml => loglikelihood/mmlu_miscellaneous.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_moral_disputes.yaml => loglikelihood/mmlu_moral_disputes.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_moral_scenarios.yaml => loglikelihood/mmlu_moral_scenarios.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_nutrition.yaml => loglikelihood/mmlu_nutrition.yaml} (52%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_philosophy.yaml => loglikelihood/mmlu_philosophy.yaml} (51%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_prehistory.yaml => loglikelihood/mmlu_prehistory.yaml} (51%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_professional_law.yaml => loglikelihood/mmlu_professional_law.yaml} (50%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_professional_medicine.yaml => loglikelihood/mmlu_professional_medicine.yaml} (50%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_sociology.yaml => loglikelihood/mmlu_sociology.yaml} (50%)
create mode 100644 lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_virology.yaml => loglikelihood/mmlu_virology.yaml} (52%)
rename lm_eval/tasks/mmlu/flan_n_shot/{mmlu_log_world_religions.yaml => loglikelihood/mmlu_world_religions.yaml} (50%)
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml
delete mode 100644 lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml
diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py
index ec1366aa..1ea16ece 100644
--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
@@ -74,7 +74,7 @@ SUBJECTS = {
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--base_yaml_path", required=True)
-    parser.add_argument("--save_prefix_path", default="flan")
+    parser.add_argument("--save_prefix_path", default="mmlu")
     parser.add_argument("--cot_prompt_path", default=None)
     parser.add_argument("--task_prefix", default="")
     parser.add_argument("--group_prefix", default="")
@@ -109,7 +109,9 @@ if __name__ == "__main__":

        yaml_dict = {
            "include": base_yaml_name,
-            "group": f"mmlu_{category}",
+            "group": f"mmlu_{args.task_prefix}_{category}"
+            if args.task_prefix != ""
+            else f"mmlu_{category}",
            "task": f"mmlu_{args.task_prefix}_{subject}"
            if args.task_prefix != ""
            else f"mmlu_{subject}",
@@ -123,22 +125,33 @@ if __name__ == "__main__":
             yaml.dump(
                 yaml_dict,
                 yaml_file,
-                width=float("inf"),
+                # width=float("inf"),
                 allow_unicode=True,
                 default_style='"',
             )

-    if args.group_prefix == "":
-        file_save_path = args.save_prefix_path + ".yaml"
+    if args.task_prefix != "":
+        mmlu_subcategories = [
+            f"mmlu_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
+        ]
+    else:
+        mmlu_subcategories = [f"mmlu_{category}" for category in ALL_CATEGORIES]
+
+    if args.group_prefix != "":
+        file_save_path = args.group_prefix + ".yaml"
     else:
-        file_save_path = args.save_prefix_path + f"_{args.group_prefix}.yaml"
+        file_save_path = args.save_prefix_path + ".yaml"
+
     eval_logger.info(f"Saving benchmark config to {file_save_path}")
     with open(file_save_path, "w") as yaml_file:
         yaml.dump(
             {
-                "group": f"mmlu_{args.group_prefix}",
-                "task": [f"mmlu_{category}" for category in ALL_CATEGORIES],
+                "group": f"mmlu_{args.task_prefix}"
+                if args.task_prefix != ""
+                else "mmlu",
+                "task": mmlu_subcategories,
             },
             yaml_file,
+            indent=4,
             default_flow_style=False,
         )
diff --git a/lm_eval/tasks/mmlu/default/mmlu.yaml b/lm_eval/tasks/mmlu/default/_mmlu.yaml
similarity index 100%
rename from lm_eval/tasks/mmlu/default/mmlu.yaml
rename to lm_eval/tasks/mmlu/default/_mmlu.yaml
diff --git a/lm_eval/tasks/mmlu/_cot_prompts.json b/lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json
similarity index 100%
rename from lm_eval/tasks/mmlu/_cot_prompts.json
rename to lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
new file mode 100644
index 00000000..cb43b048
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
@@ -0,0 +1,6 @@
+group: mmlu_flan_cot_fewshot
+task:
+  - mmlu_flan_cot_fewshot_stem
+  - mmlu_flan_cot_fewshot_other
+  - mmlu_flan_cot_fewshot_social_sciences
+  - mmlu_flan_cot_fewshot_humanities
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml
index 5c549591..f9d29bec 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml
@@ -1,5 +1,5 @@
-dataset_name: abstract_algebra
-description: "The following are multiple choice questions (with answers) about abstract\
+"dataset_name": "abstract_algebra"
+"description": "The following are multiple choice questions (with answers) about abstract\
 \ algebra.\n\nQ: Statement 1 | Every element of a group generates a cyclic subgroup\
 \ of the group. Statement 2 | The symmetric group S_10 has 10 elements.\n(A) True,\
 \ True (B) False, False (C) True, False (D) False, True\nA: Let's think step by\
@@ -36,5 +36,6 @@
 \ x = 2, hence x^2 + 1 does not have any roots. For c = 2 the polynomial x^2 + 2\
 \ has two roots at x = 1 and x = 2. Hence Z_3[x]/(x^2 + c) is a field if and only\
 \ if c = 1. The answer is (B)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_abstract_algebra
+"group": "mmlu_flan_cot_fewshot_stem"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_abstract_algebra"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml
index 28ca1c4c..144ffbe4 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml
@@ -1,15 +1,15 @@
-dataset_name: anatomy
-description: "The following are multiple choice questions (with answers) about anatomy.\n\
+"dataset_name": "anatomy"
+"description": "The following are multiple choice questions (with answers) about anatomy.\n\
 \nQ: Which of the following is the body cavity that contains the pituitary gland?\n\
 (A) Abdominal (B) Cranial (C) Pleural (D) Spinal\nA: Let's think step by step. We\
-\ refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem\
-\ step by step. The pituitary gland is the major endocrine gland attached to the\
-\ base of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\
-\nQ: Which of these branches of the trigeminal nerve contain somatic motor processes?\n\
+\ refer to Wikipedia articles on anatomy for help. Let’s solve this problem step\
+\ by step. The pituitary gland is the major endocrine gland attached to the base\
+\ of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\n\
+Q: Which of these branches of the trigeminal nerve contain somatic motor processes?\n\
 (A) The supraorbital nerve (B) The infraorbital nerve (C) The mental nerve (D) None\
 \ of the above\nA: Let's think step by step. We refer to Wikipedia articles on anatomy\
-\ for help. Let\u2019s solve this problem step by step. \nWe know the following:\
-\ (A) The supraorbital nerve (also known as the frontal nerve) is the largest branch\
+\ for help. Let’s solve this problem step by step. \nWe know the following: (A)\
+\ The supraorbital nerve (also known as the frontal nerve) is the largest branch\
 \ of the ophthalmic nerve and branch of ophthalmic division of the trigeminal nerve.\
 \ (B) The infraorbital nerve is a branch of the maxillary division of the trigeminal\
 \ nerve. (C) The mental nerve is a branch of the mandibular division of the trigeminal\
@@ -19,39 +19,39 @@ description: "The following are multiple choice questions (with answers) about a
 (A) excess overbite of the upper lateral incisors. (B) negative overjet of the upper\
 \ central incisors. (C) excess overjet of the upper lateral incisors. (D) excess\
 \ overjet of the upper central incisors.\nA: Let's think step by step. We refer\
-\ to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step\
-\ by step. This is a question related to anatomy and orthodontics. Excess overjet\
-\ is associated with Class II occlusions; therefore, we can safely eliminate (B)\
-\ from the list, as negative overjet is often associated with Class III occlusions.\
-\ Now, we need to determine the location of the excess overjet, and that would be\
-\ the upper (maxillary) lateral incisors. Only (C) has the correct information.\
-\ The answer is (C).\n\nQ: The pleura\n(A) have no sensory innervation. (B) are\
-\ separated by a 2 mm space. (C) extend into the neck. (D) are composed of respiratory\
-\ epithelium.\nA: Let's think step by step. We refer to Wikipedia articles on anatomy\
-\ for help. Let\u2019s solve this problem step by step. First, recall that the pleura\
-\ refers to the thin layer of tissue that covers the lungs and lines the interior\
-\ wall of the chest cavity. Now, let\u2019s look at each option:\nOption (A): \u201C\
-The pleura have no sensory innervation.\u201D This information is not correct. The\
-\ pleura do have a sensory innervation.\nOption (B): \u201CThe pleura are separated\
-\ by a 2 mm space.\u201D This information is not correct. There is a very thin \u201C\
-potential\u201D space between the layers of the pleura; however, it is typically\
-\ filled with serous pleural fluid. \nOption (C): \u201CThe pleura extend into the\
-\ neck.\u201D This information is actuakky true. The cervical pleura, also known\
-\ as the dome of the pleuradome of the pleura, lines the extendsiton of the pleural\
-\ cavity into the neck.\nOption (D): \u201CThe pleura are composed of respiratory\
-\ epithelium.\u201D This information is not correct. The pleaura are composed of\
-\ connective tissue (CT).\nBecause (A), (B), and (D) are all incorrect, (D) is the\
-\ only correct answer. The answer is (C).\n\nQ: What is the embryological origin\
+\ to Wikipedia articles on anatomy for help. Let’s solve this problem step by step.\
+\ This is a question related to anatomy and orthodontics. Excess overjet is associated\
+\ with Class II occlusions; therefore, we can safely eliminate (B) from the list,\
+\ as negative overjet is often associated with Class III occlusions. Now, we need\
+\ to determine the location of the excess overjet, and that would be the upper (maxillary)\
+\ lateral incisors. Only (C) has the correct information. The answer is (C).\n\n\
+Q: The pleura\n(A) have no sensory innervation. (B) are separated by a 2 mm space.\
+\ (C) extend into the neck. (D) are composed of respiratory epithelium.\nA: Let's\
+\ think step by step. We refer to Wikipedia articles on anatomy for help. Let’s\
+\ solve this problem step by step. First, recall that the pleura refers to the thin\
+\ layer of tissue that covers the lungs and lines the interior wall of the chest\
+\ cavity. Now, let’s look at each option:\nOption (A): “The pleura have no sensory\
+\ innervation.” This information is not correct. The pleura do have a sensory innervation.\n\
+Option (B): “The pleura are separated by a 2 mm space.” This information is not\
+\ correct. There is a very thin “potential” space between the layers of the pleura;\
+\ however, it is typically filled with serous pleural fluid. \nOption (C): “The\
+\ pleura extend into the neck.” This information is actuakky true. The cervical\
+\ pleura, also known as the dome of the pleuradome of the pleura, lines the extendsiton\
+\ of the pleural cavity into the neck.\nOption (D): “The pleura are composed of\
+\ respiratory epithelium.” This information is not correct. The pleaura are composed\
+\ of connective tissue (CT).\nBecause (A), (B), and (D) are all incorrect, (D) is\
+\ the only correct answer. The answer is (C).\n\nQ: What is the embryological origin\
 \ of the hyoid bone?\n(A) The first pharyngeal arch (B) The first and second pharyngeal\
 \ arches (C) The second pharyngeal arch (D) The second and third pharyngeal arches\n\
 A: Let's think step by step. We refer to Wikipedia articles on anatomy for help.\
-\ Let\u2019s solve this problem step by step. The hyoid bone, which is also known\
-\ as the hyooid, is a a small U-shaped bone located in the anterior neck. In its\
-\ resting position, it lies between the ase of the mandible and the third cervical\
-\ vertebrae. We know that the second and the third pharyngeal arches give rise to\
-\ the horns of the hyoid bone; therefore, the embryological origin of the hyoid\
-\ bone are the second and the third pharyngeal arches\u2014this information is covered\
-\ in the last option (D). Therefore, we conclude that (D) must be the correct answer.\
-\ The answer is (D)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_anatomy
+\ Let’s solve this problem step by step. The hyoid bone, which is also known as\
+\ the hyooid, is a a small U-shaped bone located in the anterior neck. In its resting\
+\ position, it lies between the ase of the mandible and the third cervical vertebrae.\
+\ We know that the second and the third pharyngeal arches give rise to the horns\
+\ of the hyoid bone; therefore, the embryological origin of the hyoid bone are the\
+\ second and the third pharyngeal arches—this information is covered in the last\
+\ option (D). Therefore, we conclude that (D) must be the correct answer. The answer\
+\ is (D)."
+"group": "mmlu_flan_cot_fewshot_stem"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_anatomy"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml
index cd50fd55..dc365959 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml
@@ -1,5 +1,5 @@
-dataset_name: astronomy
-description: "The following are multiple choice questions (with answers) about astronomy.\n\
+"dataset_name": "astronomy"
+"description": "The following are multiple choice questions (with answers) about astronomy.\n\
 \nQ: Where do most short-period comets come from and how do we know?\n(A) The Kuiper\
 \ belt; short period comets tend to be in the plane of the solar system just like\
 \ the Kuiper belt. (B) The Kuiper belt; short period comets tend to come from random\
@@ -16,39 +16,40 @@
 \ lighter on Mars. (C) It would be harder since the truck is lighter on Mars. (D)\
 \ It would be the same no matter where you are.\nA: Let's think step by step. If\
 \ we assume that there is no friction, the force needed to accelerate the truck\
-\ is by Newton\u2019s second law only dependent on the mass of the truck. Hence\
-\ (A), (B) and (C) are incorrect since it doesn\u2019t matter that it\u2019s on\
-\ Mars, and (D) is the correct answer. The answer is (D).\n\nQ: Say the pupil of\
-\ your eye has a diameter of 5 mm and you have a telescope with an aperture of 50\
-\ cm. How much more light can the telescope gather than your eye?\n(A) 10000 times\
-\ more (B) 100 times more (C) 1000 times more (D) 10 times more\nA: Let's think\
-\ step by step. The amount of light is proportional to the aperture area $A = \\\
-pi D^2/4$ for a lens with diameter $D$, so the relative amounts of light between\
-\ the eye with diameter 5mm and the telescope with diameter 50mm is $(50 cm)^2/(5mm)^2\
-\ = 10000$. The answer is (A).\n\nQ: Why isn't there a planet where the asteroid\
-\ belt is located?\n(A) A planet once formed here but it was broken apart by a catastrophic\
-\ collision. (B) There was not enough material in this part of the solar nebula\
-\ to form a planet. (C) There was too much rocky material to form a terrestrial\
-\ planet but not enough gaseous material to form a jovian planet. (D) Resonance\
-\ with Jupiter prevented material from collecting together to form a planet.\nA:\
-\ Let's think step by step. The asteroid belt is a stellar disc consisting of a\
-\ large number of asteroids between Mars and Jupiter's orbits. The asteroids in\
-\ this belt are affected by the gravitational pull from both other asteroids and\
-\ nearby planets. Due to the strong gravitational force of Jupiter there are resonances\
-\ that give rise to low density regions of asteroids known as the Kirkwood gap.\
-\ So (B) and (C) are not correct since it\u2019s not a lack of material that prevents\
-\ a planet from being formed, and (A) is incorrect because the Kirkwood gap would\
-\ have prevented a planet from forming in the first place, and (D) is the correct\
-\ option. The answer is (D).\n\nQ: Why is Mars red?\n(A) Because the surface is\
-\ covered with heavily oxidized (\"rusted\") minerals. (B) Because the atmosphere\
-\ scatters more light at bluer wavelengths transmitting mostly red light. (C) Because\
-\ Mars is covered with ancient lava flows which are red in color. (D) Because flowing\
-\ water on Mars's surface altered the surface minerals several billion years ago.\n\
-A: Let's think step by step. Option (B) is not correct because if the red color\
-\ was caused by the scattering off the atmosphere, then the earth with a much thicker\
-\ atmosphere would also look red. Options (C) and (D) are not specific enough about\
-\ why the color of the surface would be red, while (A) is correct because it explains\
-\ that the surface is red due to the rusted materials on the surface and the red\
-\ color comes from the rust. So the correct option is (A). The answer is (A)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_astronomy
+\ is by Newton’s second law only dependent on the mass of the truck. Hence (A),\
+\ (B) and (C) are incorrect since it doesn’t matter that it’s on Mars, and (D) is\
+\ the correct answer. The answer is (D).\n\nQ: Say the pupil of your eye has a diameter\
+\ of 5 mm and you have a telescope with an aperture of 50 cm. How much more light\
+\ can the telescope gather than your eye?\n(A) 10000 times more (B) 100 times more\
+\ (C) 1000 times more (D) 10 times more\nA: Let's think step by step. The amount\
+\ of light is proportional to the aperture area $A = \\pi D^2/4$ for a lens with\
+\ diameter $D$, so the relative amounts of light between the eye with diameter 5mm\
+\ and the telescope with diameter 50mm is $(50 cm)^2/(5mm)^2 = 10000$. The answer\
+\ is (A).\n\nQ: Why isn't there a planet where the asteroid belt is located?\n(A)\
+\ A planet once formed here but it was broken apart by a catastrophic collision.\
+\ (B) There was not enough material in this part of the solar nebula to form a planet.\
+\ (C) There was too much rocky material to form a terrestrial planet but not enough\
+\ gaseous material to form a jovian planet. (D) Resonance with Jupiter prevented\
+\ material from collecting together to form a planet.\nA: Let's think step by step.\
+\ The asteroid belt is a stellar disc consisting of a large number of asteroids\
+\ between Mars and Jupiter's orbits. The asteroids in this belt are affected by\
+\ the gravitational pull from both other asteroids and nearby planets. Due to the\
+\ strong gravitational force of Jupiter there are resonances that give rise to low\
+\ density regions of asteroids known as the Kirkwood gap. So (B) and (C) are not\
+\ correct since it’s not a lack of material that prevents a planet from being formed,\
+\ and (A) is incorrect because the Kirkwood gap would have prevented a planet from\
+\ forming in the first place, and (D) is the correct option. The answer is (D).\n\
+\nQ: Why is Mars red?\n(A) Because the surface is covered with heavily oxidized\
+\ (\"rusted\") minerals. (B) Because the atmosphere scatters more light at bluer\
+\ wavelengths transmitting mostly red light. (C) Because Mars is covered with ancient\
+\ lava flows which are red in color. (D) Because flowing water on Mars's surface\
+\ altered the surface minerals several billion years ago.\nA: Let's think step by\
+\ step. Option (B) is not correct because if the red color was caused by the scattering\
+\ off the atmosphere, then the earth with a much thicker atmosphere would also look\
+\ red. Options (C) and (D) are not specific enough about why the color of the surface\
+\ would be red, while (A) is correct because it explains that the surface is red\
+\ due to the rusted materials on the surface and the red color comes from the rust.\
+\ So the correct option is (A). The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_stem"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_astronomy"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml
index 60d939a8..53e6b96d 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml
@@ -1,5 +1,5 @@
-dataset_name: business_ethics
-description: "The following are multiple choice questions (with answers) about business\
+"dataset_name": "business_ethics"
+"description": "The following are multiple choice questions (with answers) about business\
 \ ethics.\n\nQ: In contrast to _______, _______ aim to reward favourable behaviour\
 \ by companies. The success of such campaigns have been heightened through the use\
 \ of ___________, which allow campaigns to facilitate the company in achieving _________\
@@ -7,12 +7,12 @@ description: "The following are multiple choice questions (with answers) about b
 \ Boycotts, Digital technology, Increased Sales (C) Boycotts, Buyalls, Blockchain\
 \ technology, Charitable donations (D) Boycotts, Buycotts, Digital technology, Increased\
 \ Sales\nA: Let's think step by step. We refer to Wikipedia articles on business\
-\ ethics for help. The sentence that best uses the possible options above is \u201C\
-In contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies.\
+\ ethics for help. The sentence that best uses the possible options above is “In\
+\ contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies.\
 \ The success of such campaigns have been heightened through the use of *digital\
 \ technology*, which allow campaigns to facilitate the company in achieving *increased\
-\ sales*.\u201D The answer is (D).\n\nQ: _______ is the direct attempt to formally\
-\ or informally manage ethical issues or problems, through specific policies, practices\
+\ sales*.” The answer is (D).\n\nQ: _______ is the direct attempt to formally or\
+\ informally manage ethical issues or problems, through specific policies, practices\
 \ and programmes.\n(A) Corporate social responsibility (B) Business ethics management\
 \ (C) Sustainability (D) Environmental management\nA: Let's think step by step.\
 \ We refer to Wikipedia articles on business ethics for help. The direct attempt\
@@ -26,30 +26,31 @@
 \ action, Violent direct action, Non-violent direct-action Boycott (D) Non-violent\
 \ direct action, Instrumental action, Indirect action, Information campaign\nA:\
 \ Let's think step by step. We refer to Wikipedia articles on business ethics for\
-\ help. The sentence that best uses the possible options above is \u201CThree contrasting\
+\ help. The sentence that best uses the possible options above is “Three contrasting\
 \ tactics that CSO's can engage in to meet their aims are *indirect action*, which\
 \ typically involves research and communication, *violent direct action*, which\
 \ may involve physically attacking a company's operations or *non-violent direct\
-\ action*, often involving some form of *boycott*.\u201D The answer is (C).\n\n\
-Q: To ensure the independence of the non-executive board members, there are a number\
+\ action*, often involving some form of *boycott*.” The answer is (C).\n\nQ: To\
+\ ensure the independence of the non-executive board members, there are a number\
 \ of steps which can be taken, which include non-executives being drawn from _______\
 \ the company, being appointed for a _________ time period as well as being appointed\
 \ _________.\n(A) Outside, Limited, Independently (B) Inside, Limited, Intermittently\
 \ (C) Outside, Unlimited, Intermittently (D) Inside, Unlimited, Independently\n\
 A: Let's think step by step. We refer to Wikipedia articles on business ethics for\
-\ help. The sentence that best uses the possible options above is \u201CTo ensure\
-\ the independence of the non-executive board members, there are a number of steps\
-\ which can be taken, which include non-executives being draw from *outside* the\
-\ company, being appointed for a *limited* time period as well as being imported\
-\ *independently*. The answer is (A).\n\nQ: Beyond the business case for engaging\
-\ in CSR there are a number of moral arguments relating to: negative _______, the\
-\ _______that corporations possess and the ________ of business and society.\n(A)\
-\ Externalities, Power, Independence (B) Publicity, Insubstantial resources, Mutual\
-\ dependence (C) Publicity, Power, Independence (D) Externalities, Power, Mutual\
-\ dependence\nA: Let's think step by step. We refer to Wikipedia articles on business\
-\ ethics for help. The sentence that best uses the possible options above is \u201C\
-Beyond the business case for engaging the CSR there are a number of moral arguments\
-\ relating to: negative *externalities*, the *power* that corporations possess and\
-\ the *mutual independence* of business and society. The answer is (D)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_business_ethics
+\ help. The sentence that best uses the possible options above is “To ensure the\
+\ independence of the non-executive board members, there are a number of steps which\
+\ can be taken, which include non-executives being draw from *outside* the company,\
+\ being appointed for a *limited* time period as well as being imported *independently*.\
+\ The answer is (A).\n\nQ: Beyond the business case for engaging in CSR there are\
+\ a number of moral arguments relating to: negative _______, the _______that corporations\
+\ possess and the ________ of business and society.\n(A) Externalities, Power, Independence\
+\ (B) Publicity, Insubstantial resources, Mutual dependence (C) Publicity, Power,\
+\ Independence (D) Externalities, Power, Mutual dependence\nA: Let's think step\
+\ by step. We refer to Wikipedia articles on business ethics for help. The sentence\
+\ that best uses the possible options above is “Beyond the business case for engaging\
+\ the CSR there are a number of moral arguments relating to: negative *externalities*,\
+\ the *power* that corporations possess and the *mutual independence* of business\
+\ and society. The answer is (D)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_business_ethics"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml
index a591a7bd..6f73eda0 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml
@@ -1,58 +1,35 @@
-dataset_name: clinical_knowledge
-description: 'The following are multiple choice questions (with answers) about clinical
-  knowledge.
-
-
-  Q: Glycolysis is the name given to the pathway involving the conversion of:
-
-  (A) glycogen to glucose-1-phosphate. (B) glycogen or glucose to fructose. (C) glycogen
-  or glucose to pyruvate or lactate. (D) glycogen or glucose to pyruvate or acetyl
-  CoA.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge
-  for help. Glycolysis is the name given to the pathway involving conversion of glycogen
-  or glucose to pyruvate or lactate. The answer is (C).
-
-
-  Q: What is the difference between a male and a female catheter?
-
-  (A) Male and female catheters are different colours. (B) Male catheters are longer
-  than female catheters. (C) Male catheters are bigger than female catheters. (D)
-  Female catheters are longer than male catheters.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge
-  for help. The difference between a male and female catheter is that male catheters
-  tend to be longer than female catheters. The answer is (B).
-
-
-  Q: How many attempts should you make to cannulate a patient before passing the job
-  on to a senior colleague, according to the medical knowledge of 2020?
-
-  (A) 4 (B) 3 (C) 2 (D) 1
-
-  A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge
-  for help. According to the medical protocol as of 2020, you should make two attempts
-  to cannulate a patient before passing the job on to a more-senior practitioner.
-  The answer is (C).
-
-
-  Q: In the assessment of the hand function which of the following is true?
-
-  (A) Abduction of the thumb is supplied by spinal root T2 (B) Opposition of the thumb
-  by opponens policis is supplied by spinal root T1 (C) Finger adduction is supplied
-  by the median nerve (D) Finger abduction is mediated by the palmar interossei
-
-  A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge
-  for help. Of all the options, it is only true that the opposition of the thumb by
-  opponens pollicis is supplied by spinal root T1. The answer is (B).
-
-
-  Q: The energy for all forms of muscle contraction is provided by:
-
-  (A) ATP. (B) ADP. (C) phosphocreatine. (D) oxidative phosphorylation.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge
-  for help. The energy for muscular contraction is provided by ATP (adenosine triphosphate),
-  which is the powerhouse of the cell. The answer is (A).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_clinical_knowledge
+"dataset_name": "clinical_knowledge"
+"description": "The following are multiple choice questions (with answers) about clinical\
+\ knowledge.\n\nQ: Glycolysis is the name given to the pathway involving the conversion\
+\ of:\n(A) glycogen to glucose-1-phosphate. (B) glycogen or glucose to fructose.\
+\ (C) glycogen or glucose to pyruvate or lactate. (D) glycogen or glucose to pyruvate\
+\ or acetyl CoA.\nA: Let's think step by step. We refer to Wikipedia articles on\
+\ clinical knowledge for help. Glycolysis is the name given to the pathway involving\
+\ conversion of glycogen or glucose to pyruvate or lactate. The answer is (C).\n\
+\nQ: What is the difference between a male and a female catheter?\n(A) Male and\
+\ female catheters are different colours. (B) Male catheters are longer than female\
+\ catheters. (C) Male catheters are bigger than female catheters. (D) Female catheters\
+\ are longer than male catheters.\nA: Let's think step by step. We refer to Wikipedia\
+\ articles on clinical knowledge for help. The difference between a male and female\
+\ catheter is that male catheters tend to be longer than female catheters. The answer\
+\ is (B).\n\nQ: How many attempts should you make to cannulate a patient before\
+\ passing the job on to a senior colleague, according to the medical knowledge of\
+\ 2020?\n(A) 4 (B) 3 (C) 2 (D) 1\nA: Let's think step by step. We refer to Wikipedia\
+\ articles on clinical knowledge for help. According to the medical protocol as\
+\ of 2020, you should make two attempts to cannulate a patient before passing the\
+\ job on to a more-senior practitioner. The answer is (C).\n\nQ: In the assessment\
+\ of the hand function which of the following is true?\n(A) Abduction of the thumb\
+\ is supplied by spinal root T2 (B) Opposition of the thumb by opponens policis\
+\ is supplied by spinal root T1 (C) Finger adduction is supplied by the median nerve\
+\ (D) Finger abduction is mediated by the palmar interossei\nA: Let's think step\
+\ by step. We refer to Wikipedia articles on clinical knowledge for help. Of all\
+\ the options, it is only true that the opposition of the thumb by opponens pollicis\
+\ is supplied by spinal root T1. The answer is (B).\n\nQ: The energy for all forms\
+\ of muscle contraction is provided by:\n(A) ATP. (B) ADP. (C) phosphocreatine.\
+\ (D) oxidative phosphorylation.\nA: Let's think step by step. We refer to Wikipedia\
+\ articles on clinical knowledge for help. The energy for muscular contraction is\
+\ provided by ATP (adenosine triphosphate), which is the powerhouse of the cell.\
+\ The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_clinical_knowledge"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml
index be51794a..1cd13c56 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml
@@ -1,5 +1,5 @@
-dataset_name: college_biology
-description: "The following are multiple choice questions (with answers) about college\
+"dataset_name": "college_biology"
+"description": "The following are multiple choice questions (with answers) about college\
 \ biology.\n\nQ: Which of the following represents an accurate statement concerning\
 \ arthropods?\n(A) They possess an exoskeleton composed primarily of peptidoglycan.\
 \ (B) They possess an open circulatory system with a dorsal heart. (C) They are\
@@ -19,7 +19,7 @@
 \ Law, $p^2 + 2 p q + q^2 = 1$, and $p + q = 1$ where $p$ is the frequency of the\
 \ dominant allele, $q$ is the frequency of the recessive allele, and $p^2$, $q^2$,\
 \ and $2pq$ are the frequencies of dominant homozygous, recessive homozygous, and\
-\ heterozygous individuals, respectively. \u200BThe frequency of the recessive allele\
+\ heterozygous individuals, respectively. ​The frequency of the recessive allele\
 \ (q) is $\\sqrt{\frac{1}{400}} = 0.05$. We have $p = 1 - q = 0.95$. The frequency\
 \ of heterozygous individuals is $2pq = 2 \\cdot 0.05 \\cdot 0.95 = 0.095$. The\
 \ number of heterozygous individuals is equal to the frequency of heterozygous individuals\
@@ -56,5 +56,6 @@
 \ the human and bird forearms, which rules out (D). Humans and birds do belong to\
 \ the same clade - a group of organisms composed of a common ancestor. The answer\
 \ is (C)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_college_biology
+"group": "mmlu_flan_cot_fewshot_stem"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_college_biology"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
index a02c909e..08f002b5 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
@@ -1,37 +1,38 @@
-dataset_name: college_chemistry
-description: "The following are multiple choice questions (with answers) about college\
-\ chemistry.\n\nQ: 3 Cl\u2212(aq) + 4 CrO_4^2\u2212(aq) + 23 H+(aq) \u2192 3 HClO2(aq)\
-\ + 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl\u2212(aq) behaves as\n\
-(A) an acid (B) a base (C) a catalyst (D) a reducing agent\nA: Let's think step\
-\ by step. A molecule that behaves as a base accepts an H+ ion (or proton) from\
-\ another molecule, whereas a molecule that behaves as an acid donates an H+ ion\
-\ (or proton) to another molecule. Neither of these is the case for Cl in this reaction,\
-\ which rules out (A) and (B). A catalyst is a substance that only accelerates a\
-\ reaction without itself undergoing chemical change, which is not the case here.\
-\ This rules out (C). Instead, the $Cl^{-} molecules carry a negative charge, which\
-\ they donate in the reaction to form 3 HClO2. This is the behavior of a reducing\
-\ agent, or (D). The answer is (D).\n\nQ: Which of the following statements about\
-\ the lanthanide elements is NOT true?\n(A) The most common oxidation state for\
-\ the lanthanide elements is +3. (B) Lanthanide complexes often have high coordination\
-\ numbers (> 6). (C) All of the lanthanide elements react with aqueous acid to liberate\
-\ hydrogen. (D) The atomic radii of the lanthanide elements increase across the\
-\ period from La to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide\
-\ elements in fact decrease across the period from La to Lu. Options (A), (B), and\
-\ (C) are all true. This means that only (D) is NOT true. The answer is (D).\n\n\
-Q: Which of the following lists the hydrides of group-14 elements in order of thermal\
-\ stability, from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4\
-\ < SnH4 < CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4\
-\ < GeH4 < SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14\
-\ hydrides decreases as we move from the top of group 14 to the bottom. The order\
-\ of elements in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in\
-\ order of increasing thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4,\
-\ or answer (A). The answer is (A).\n\nQ: Predict the number of lines in the EPR\
-\ spectrum of a solution of 13C-labelled methyl radical (13CH3\u2022), assuming\
-\ the lines do not overlap.\n(A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step\
-\ by step. The electron paramagnetic resonance spectrum will be split by two forms\
-\ of interactions. The first is the hyperfine interaction with the 13C (nuclear\
-\ spin $I = \nrac{1}{2}$) which will split the spectrum into 2 lines. This will\
-\ be further split into 4 lines by the interaction with three equivalent 1H nuclei.\
-\ The total number of lines is therefore $2 \\cdot 4 = 8$. The answer is (E)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_college_chemistry
+"dataset_name": "college_chemistry"
+"description": "The following are multiple choice questions (with answers) about college\
+\ chemistry.\n\nQ: 3 Cl−(aq) + 4 CrO_4^2−(aq) + 23 H+(aq) → 3 HClO2(aq) + 4 Cr3+(aq)\
+\ + 10 H2O(l). In the reaction shown above, Cl−(aq) behaves as\n(A) an acid (B)\
+\ a base (C) a catalyst (D) a reducing agent\nA: Let's think step by step. A molecule\
+\ that behaves as a base accepts an H+ ion (or proton) from another molecule, whereas\
+\ a molecule that behaves as an acid donates an H+ ion (or proton) to another molecule.\
+\ Neither of these is the case for Cl in this reaction, which rules out (A) and\
+\ (B). A catalyst is a substance that only accelerates a reaction without itself\
+\ undergoing chemical change, which is not the case here. This rules out (C). Instead,\
+\ the $Cl^{-} molecules carry a negative charge, which they donate in the reaction\
+\ to form 3 HClO2. This is the behavior of a reducing agent, or (D). The answer\
+\ is (D).\n\nQ: Which of the following statements about the lanthanide elements\
+\ is NOT true?\n(A) The most common oxidation state for the lanthanide elements\
+\ is +3. (B) Lanthanide complexes often have high coordination numbers (> 6). (C)\
+\ All of the lanthanide elements react with aqueous acid to liberate hydrogen. (D)\
+\ The atomic radii of the lanthanide elements increase across the period from La\
+\ to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide elements\
+\ in fact decrease across the period from La to Lu. Options (A), (B), and (C) are\
+\ all true. This means that only (D) is NOT true. The answer is (D).\n\nQ: Which\
+\ of the following lists the hydrides of group-14 elements in order of thermal stability,\
+\ from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4 < SnH4 <\
+\ CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4 < GeH4 <\
+\ SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14 hydrides\
+\ decreases as we move from the top of group 14 to the bottom. The order of elements\
+\ in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in order of increasing\
+\ thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4, or answer (A). The\
+\ answer is (A).\n\nQ: Predict the number of lines in the EPR spectrum of a solution\
+\ of 13C-labelled methyl radical (13CH3•), assuming the lines do not overlap.\n\
+(A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step by step. The electron paramagnetic\
+\ resonance spectrum will be split by two forms of interactions. The first is the\
+\ hyperfine interaction with the 13C (nuclear spin $I = \nrac{1}{2}$) which will\
+\ split the spectrum into 2 lines. This will be further split into 4 lines by the\
+\ interaction with three equivalent 1H nuclei. The total number of lines is therefore\
+\ $2 \\cdot 4 = 8$. The answer is (E)."
+"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml index 20b398c1..e3a20705 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml @@ -1,189 +1,79 @@ -dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college - computer science. - - - Q: Which of the following regular expressions is equivalent to (describes the same - set of strings as) (a* + b)*(c + d)? - - (A) a*(c + d)+ b(c + d) - - (B) a*(c + d)* + b(c + d)* - - (C) a*(c + d)+ b*(c + d) - - (D) (a + b)*c +(a + b)*d - - A: Let''s think step by step. We know that: - - 1. (X* + Y)* = (X + Y)* - - 2. X(Y + Z)? = XY + XZ - - Using equation 1 we can rewrite (a* + b)*(c + d)? as: - - 3. (a + b)*(c + d)? - - Using equation 2 we can rewrite equation 3 as: - - (a + b)*c + (a + b)*d The answer is (D). - - - Q: The Singleton design pattern is used to guarantee that only a single instance - of a class may be instantiated. Which of the following is (are) true of this design - pattern? - - I. The Singleton class has a static factory method to provide its instance. - - II. The Singleton class can be a subclass of another class. - - III. The Singleton class has a private constructor. - - (A) I only - - (B) II only - - (C) III only - - (D) I, II, and III - - A: Let''s think step by step. Statement I is a correct statement about a Singleton, - because a Singleton restricts instantiation to a single, static method. Statement - II is also correct, because there is no inherent restriction regarding the inheritance - of a Singleton. Statement III is also correct, because a Singletons must be instantiated - only once, so its constructor is made private to prevent any construction except - via its static factory method. - - Given these facts, statements I, II, and III are all correct. The answer is (D). - - - Q: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, . . - . , R7 and supports the following operations: - - ADD Rs1, Rs2, Rd (Add Rs1 to Rs2 and put the sum in Rd) - - MUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the product in Rd) - - An operation normally takes one cycle; however, an operation takes two cycles if - it produces a result required by the immediately following operation in an operation - sequence. - - Consider the expression AB + ABC + BC, where variables A, B, C are located in registers - R0, R1, R2. If the contents of these three registers must not be modified, what - is the minimum number of clock cycles required for an operation sequence that computes - the value of AB + ABC + BC? - - (A) 5 (B) 6 (C) 7 (D) 8 - - A: Let''s think step by step. First, we are given that A is in R0, B is in R1, and - C is in R2. - - Next, we can see that we must compute three multiplies (AB, BC, and ABC) and two - adds (AB + ABC, (AB + ABC) + BC) to compute our final answer, resulting in a minimum - of five clock cycles. - - Next, we can see that there is no way to avoid at least one pipeline stall when - computing our final answer, because to compute our final sum we must wait at least - one cycle for the results from the previous stage to be ready. Thus, our minimum - number of cycles must be 6. 
- - We can verify that we can create a solution that requires only six cycles as follows: - - compute AB: MUL R0, R1, R3 - - compute BC: MUL R1, R2, R4 - - compute ABC: MUL R3, R4, R5 - - compute AB + BC: ADD R3, R4, R6 - - STALL - - compute AB + ABC + BC: ADD R5, R6, R7 - - So there are 6 cycles. The answer is (B). - - - Q: A compiler generates code for the following assignment statement. - - G := (A + B) * C - (D + E) * F - - The target machine has a single accumulator and a single-address instruction set - consisting of instructions load, store, add, subtract, and multiply. For the arithmetic - operations, the left operand is taken from the accumulator and the result appears - in the accumulator. The smallest possible number of instructions in the resulting - code is - - (A) 5 (B) 6 (C) 7 (D) 9 - - A: Let''s think step by step. We can compute the final answer with the following - sequence of operations: - - 1. LOAD D (accumulator = D) - - 2. ADD E (accumulator = D+E) - - 3. MUL F (accumulator = (D+E)*F) - - 4. STORE X (X = (D+E)*F) - - 5. LOAD A (accumulator = A) - - 6. ADD B (accumulator = A+B) - - 7. MUL C (accumulator = (A+B)*C) - - 8. SUB X (accumulator = (A+B)*C - (D+E)*F) - - 9. STORE G (G = (A+B)*C - (D+E)*F) - - This sequence takes 9 instructions. The answer is (D). - - - Q: Consider a computer design in which multiple processors, each with a private - cache memory, share global memory using a single bus. This bus is the critical system - resource. Each processor can execute one instruction every 500 nanoseconds as long - as memory references are satisfied by its local cache. When a cache miss occurs, - the processor is delayed for an additional 2,000 nanoseconds. During half of this - additional delay, the bus is dedicated to serving the cache miss. During the other - half, the processor cannot continue, but the bus is free to service requests from - other processors. On average, each instruction requires 2 memory references. On - average, cache misses occur on 1 percent of references. What proportion of the capacity - of the bus would a single processor consume, ignoring delays due to competition - from other processors? - - (A) 1/50 (B) 1/27 (C) 1/25 (D) 2/27 - - A: Let''s think step by step. We know that each instruction requires two memory - references per instruction, and that there is an average cache miss rate of one - percent. - - Thus a given processor has: - - (1 cache miss / 100 references) * (2 references / instruction) = - - (2 cache misses / 100 instructions), so: - - misses_per_instruction = 1 cache miss / 50 instructions. - - Next, we know that each instruction requires 500 nanoseconds when there is no cache - miss, and 500 + 2000 = 2500 nanoseconds when there is a cache miss. Thus: - - 50 instructions / (49 * 500) + (1 * 2500) nanoseconds, so: - - instructions_per_ns = 50 instructions / 27000 nanoseconds. - - Now, we know that each cache miss locks the bus for half of the 2000 nanosecond - cache miss delay, or 1000 nanoseconds, so: - - lock_ns_per_miss = 1000 nanoseconds / cache miss. - - Thus we can see that on average a single processor will lock the bus for: - - lock_ns_per_miss * misses_per_instruction * instructions_per_ns = - - (1000 nanoseconds / cache miss) * (1 cache miss / 50 instructions) * (50 instructions - / 27000 nanoseconds) = 1000 * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer - is (B).' 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_computer_science +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\nQ: Which of the following regular expressions is equivalent\ + \ to (describes the same set of strings as) (a* + b)*(c + d)?\n(A) a*(c + d)+ b(c\ + \ + d)\n(B) a*(c + d)* + b(c + d)*\n(C) a*(c + d)+ b*(c + d)\n(D) (a + b)*c +(a\ + \ + b)*d\nA: Let's think step by step. We know that:\n1. (X* + Y)* = (X + Y)*\n\ + 2. X(Y + Z)? = XY + XZ\nUsing equation 1 we can rewrite (a* + b)*(c + d)? as:\n\ + 3. (a + b)*(c + d)?\nUsing equation 2 we can rewrite equation 3 as:\n(a + b)*c +\ + \ (a + b)*d The answer is (D).\n\nQ: The Singleton design pattern is used to guarantee\ + \ that only a single instance of a class may be instantiated. Which of the following\ + \ is (are) true of this design pattern?\nI. The Singleton class has a static factory\ + \ method to provide its instance.\nII. The Singleton class can be a subclass of\ + \ another class.\nIII. The Singleton class has a private constructor.\n(A) I only\n\ + (B) II only\n(C) III only\n(D) I, II, and III\nA: Let's think step by step. Statement\ + \ I is a correct statement about a Singleton, because a Singleton restricts instantiation\ + \ to a single, static method. Statement II is also correct, because there is no\ + \ inherent restriction regarding the inheritance of a Singleton. Statement III is\ + \ also correct, because a Singletons must be instantiated only once, so its constructor\ + \ is made private to prevent any construction except via its static factory method.\n\ + Given these facts, statements I, II, and III are all correct. The answer is (D).\n\ + \nQ: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, .\ + \ . . , R7 and supports the following operations:\nADD Rs1, Rs2, Rd (Add Rs1 to\ + \ Rs2 and put the sum in Rd)\nMUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the\ + \ product in Rd)\nAn operation normally takes one cycle; however, an operation takes\ + \ two cycles if it produces a result required by the immediately following operation\ + \ in an operation sequence.\nConsider the expression AB + ABC + BC, where variables\ + \ A, B, C are located in registers R0, R1, R2. If the contents of these three registers\ + \ must not be modified, what is the minimum number of clock cycles required for\ + \ an operation sequence that computes the value of AB + ABC + BC?\n(A) 5 (B) 6 (C)\ + \ 7 (D) 8\nA: Let's think step by step. First, we are given that A is in R0, B is\ + \ in R1, and C is in R2.\nNext, we can see that we must compute three multiplies\ + \ (AB, BC, and ABC) and two adds (AB + ABC, (AB + ABC) + BC) to compute our final\ + \ answer, resulting in a minimum of five clock cycles.\nNext, we can see that there\ + \ is no way to avoid at least one pipeline stall when computing our final answer,\ + \ because to compute our final sum we must wait at least one cycle for the results\ + \ from the previous stage to be ready. Thus, our minimum number of cycles must be\ + \ 6.\nWe can verify that we can create a solution that requires only six cycles\ + \ as follows:\ncompute AB: MUL R0, R1, R3\ncompute BC: MUL R1, R2, R4\ncompute ABC:\ + \ MUL R3, R4, R5\ncompute AB + BC: ADD R3, R4, R6\nSTALL\ncompute AB + ABC + BC:\ + \ ADD R5, R6, R7\nSo there are 6 cycles. 
The answer is (B).\n\nQ: A compiler generates\ + \ code for the following assignment statement.\nG := (A + B) * C - (D + E) * F\n\ + The target machine has a single accumulator and a single-address instruction set\ + \ consisting of instructions load, store, add, subtract, and multiply. For the arithmetic\ + \ operations, the left operand is taken from the accumulator and the result appears\ + \ in the accumulator. The smallest possible number of instructions in the resulting\ + \ code is\n(A) 5 (B) 6 (C) 7 (D) 9\nA: Let's think step by step. We can compute\ + \ the final answer with the following sequence of operations:\n1. LOAD D (accumulator\ + \ = D)\n2. ADD E (accumulator = D+E)\n3. MUL F (accumulator = (D+E)*F)\n4. STORE\ + \ X (X = (D+E)*F)\n5. LOAD A (accumulator = A)\n6. ADD B (accumulator = A+B)\n\ + 7. MUL C (accumulator = (A+B)*C)\n8. SUB X (accumulator = (A+B)*C - (D+E)*F)\n\ + 9. STORE G (G = (A+B)*C - (D+E)*F)\nThis sequence takes 9 instructions. The answer\ + \ is (D).\n\nQ: Consider a computer design in which multiple processors, each with\ + \ a private cache memory, share global memory using a single bus. This bus is the\ + \ critical system resource. Each processor can execute one instruction every 500\ + \ nanoseconds as long as memory references are satisfied by its local cache. When\ + \ a cache miss occurs, the processor is delayed for an additional 2,000 nanoseconds.\ + \ During half of this additional delay, the bus is dedicated to serving the cache\ + \ miss. During the other half, the processor cannot continue, but the bus is free\ + \ to service requests from other processors. On average, each instruction requires\ + \ 2 memory references. On average, cache misses occur on 1 percent of references.\ + \ What proportion of the capacity of the bus would a single processor consume, ignoring\ + \ delays due to competition from other processors?\n(A) 1/50 (B) 1/27 (C) 1/25 (D)\ + \ 2/27\nA: Let's think step by step. We know that each instruction requires two\ + \ memory references per instruction, and that there is an average cache miss rate\ + \ of one percent.\nThus a given processor has:\n(1 cache miss / 100 references)\ + \ * (2 references / instruction) =\n(2 cache misses / 100 instructions), so:\nmisses_per_instruction\ + \ = 1 cache miss / 50 instructions.\nNext, we know that each instruction requires\ + \ 500 nanoseconds when there is no cache miss, and 500 + 2000 = 2500 nanoseconds\ + \ when there is a cache miss. Thus:\n50 instructions / (49 * 500) + (1 * 2500) nanoseconds,\ + \ so:\ninstructions_per_ns = 50 instructions / 27000 nanoseconds.\nNow, we know\ + \ that each cache miss locks the bus for half of the 2000 nanosecond cache miss\ + \ delay, or 1000 nanoseconds, so:\nlock_ns_per_miss = 1000 nanoseconds / cache miss.\n\ + Thus we can see that on average a single processor will lock the bus for:\nlock_ns_per_miss\ + \ * misses_per_instruction * instructions_per_ns =\n(1000 nanoseconds / cache miss)\ + \ * (1 cache miss / 50 instructions) * (50 instructions / 27000 nanoseconds) = 1000\ + \ * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer is (B)." 
+"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml index 4442f9ed..9d5d975e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml @@ -1,49 +1,50 @@ -dataset_name: college_mathematics -description: "The following are multiple choice questions (with answers) about college\ +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college\ \ mathematics.\n\nQ: Let V be the set of all real polynomials p(x). Let transformations\ \ T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret\ \ (ST)(p(x)) as S(T(p(x))). Which of the following is true?\n(A) ST = 0 (B) ST =\ \ T (C) ST = TS (D) ST - TS is the identity map of V onto itself.\nA: Let's think\ - \ step by step. For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))\u2019 = p(x)\ - \ + xp\u2019(x)\\]\nand\n\\[TS(p) = xp\u2019(x).\\]\nHence \\[ST(p) - TS(p) = p(x)\ - \ + xp\u2019(x) - xp\u2019(x).\\] The answer is (D).\n\nQ: Suppose that f(1 + x)\ - \ = f(x) for all real x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11\ - \ (B) 0 (C) 11 (D) 33/2\nA: Let's think step by step. The only polynomial so that\ - \ $f(1 + x) = f(x)$ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. The answer\ - \ is (C).\n\nQ: Let A be a real 2x2 matrix. Which of the following statements must\ - \ be true?\nI. All of the entries of A^2 are nonnegative.\nII. The determinant of\ - \ A^2 is nonnegative.\nIII. If A has two distinct eigenvalues, then A^2 has two\ - \ distinct eigenvalues.\n(A) I only (B) II only (C) III only (D) II and III only\n\ - A: Let's think step by step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence\ - \ II holds.\nIII is false: as a counterexample take a diagonal matrix with -1 and\ - \ 1 on the diagonal. Then $A^2$ is the identity matrix. The answer is (B).\n\nQ:\ - \ Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n =\ - \ 22. What is the greatest negative number in the set B = {m + n : (m, n) \\in A}?\n\ - (A) -5 (B) -4 (C) -3 (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m\ - \ and one of the solutions is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need\ - \ to look for smaller $m$ in order to make $m + n$ negative. The next solution is\ - \ $m = -14$ and $n = 10$. For smaller $m$ we have $m + n$ smaller than $-4$. The\ - \ answer is (B).\n\nQ: A tank initially contains a salt solution of 3 grams of salt\ - \ dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt\ - \ per liter of water is sprayed into the tank at a rate of 4 liters per minute.\ - \ The sprayed solution is continually mixed with the salt solution in the tank,\ - \ and the mixture flows out of the tank at a rate of 4 liters per minute. If the\ - \ mixing is instantaneous, how many grams of salt are in the tank after 100 minutes\ - \ have elapsed?\n(A) 2 (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step\ - \ by step. For all $t \\in \\mathbb{R}$, let $s(t)$ denote the number grams of salt\ - \ in the tank at the $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably.\ - \ We also use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. 
The solution sprayed\ - \ into the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always\ - \ 100 liters of liquid in the tank, containing $s$ grams of salt. So the density\ - \ of salt in the tank is $s / 100$ grams per liter. The flow of water out of the\ - \ tank therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for\ - \ all $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$,\ - \ and so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\right]$. For all $t \\in\ - \ \\mathbb{R}$,\n$$\n\frac{d}{d t}[\\ln (s-2)]=\frac{s^{\\prime}}{s-2}=\frac{-1}{25}=\f\ - rac{d}{d t}\\left[-\frac{t}{25}\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that,\ - \ for all $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then,\ - \ for all $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K\ - \ e^{-t / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100\ - \ / 25}=2+1 \\cdot e^{-4}=2+e^{-4}$. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_mathematics + \ step by step. For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))’ = p(x) +\ + \ xp’(x)\\]\nand\n\\[TS(p) = xp’(x).\\]\nHence \\[ST(p) - TS(p) = p(x) + xp’(x)\ + \ - xp’(x).\\] The answer is (D).\n\nQ: Suppose that f(1 + x) = f(x) for all real\ + \ x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11 (B) 0 (C) 11 (D)\ + \ 33/2\nA: Let's think step by step. The only polynomial so that $f(1 + x) = f(x)$\ + \ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. The answer is (C).\n\n\ + Q: Let A be a real 2x2 matrix. Which of the following statements must be true?\n\ + I. All of the entries of A^2 are nonnegative.\nII. The determinant of A^2 is nonnegative.\n\ + III. If A has two distinct eigenvalues, then A^2 has two distinct eigenvalues.\n\ + (A) I only (B) II only (C) III only (D) II and III only\nA: Let's think step by\ + \ step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence II holds.\nIII is false:\ + \ as a counterexample take a diagonal matrix with -1 and 1 on the diagonal. Then\ + \ $A^2$ is the identity matrix. The answer is (B).\n\nQ: Let A be the set of all\ + \ ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest\ + \ negative number in the set B = {m + n : (m, n) \\in A}?\n(A) -5 (B) -4 (C) -3\ + \ (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m and one of the solutions\ + \ is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need to look for smaller $m$\ + \ in order to make $m + n$ negative. The next solution is $m = -14$ and $n = 10$.\ + \ For smaller $m$ we have $m + n$ smaller than $-4$. The answer is (B).\n\nQ: A\ + \ tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters\ + \ of water. A salt solution containing 0.02 grams of salt per liter of water is\ + \ sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is\ + \ continually mixed with the salt solution in the tank, and the mixture flows out\ + \ of the tank at a rate of 4 liters per minute. If the mixing is instantaneous,\ + \ how many grams of salt are in the tank after 100 minutes have elapsed?\n(A) 2\ + \ (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step by step. For all $t\ + \ \\in \\mathbb{R}$, let $s(t)$ denote the number grams of salt in the tank at the\ + \ $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably. 
We also\ + \ use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. The solution sprayed into\ + \ the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always 100\ + \ liters of liquid in the tank, containing $s$ grams of salt. So the density of\ + \ salt in the tank is $s / 100$ grams per liter. The flow of water out of the tank\ + \ therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for all\ + \ $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$, and\ + \ so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\right]$. For all $t \\in \\mathbb{R}$,\n\ + $$\n\frac{d}{d t}[\\ln (s-2)]=\frac{s^{\\prime}}{s-2}=\frac{-1}{25}=\frac{d}{d t}\\\ + left[-\frac{t}{25}\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that, for all\ + \ $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then, for all\ + \ $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K e^{-t\ + \ / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100 / 25}=2+1\ + \ \\cdot e^{-4}=2+e^{-4}$. The answer is (D)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml index 8f3ae14e..68c7f434 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml @@ -1,5 +1,5 @@ -dataset_name: college_medicine -description: "The following are multiple choice questions (with answers) about college\ +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college\ \ medicine.\n\nQ: An expected side effect of creatine supplementation is:\n(A) muscle\ \ weakness. (B) gain in body mass. (C) muscle cramps. (D) loss of electrolytes.\n\ A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\ @@ -9,44 +9,44 @@ description: "The following are multiple choice questions (with answers) about c \ endurance runners have a high proportion of Type I fibres in their leg muscles\ \ (C) Liver glycogen is important in the maintenance of the blood glucose concentration\ \ (D) Insulin promotes glucose uptake by all tissues in the body\nA: Let's think\ - \ step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s\ - \ solve this step by step and go over each choice: \n(A) \u201CMuscle glycogen is\ - \ broken down enzymatically to glucose-1-phosphate\u201D: This is a correct statement.\n\ - (B) \u201CElite endurance runners have a high proportion of Type I fibres in their\ - \ leg muscles\u201D: This is a correct statement.\n(C) \u201CLiver glycogen is important\ - \ in the maintenance of the blood glucose concentration\u201D: This is a correct\ - \ statement. \n(D) \u201CInsulin promotes glucose uptake by all tissues in the body\u201D\ - : This is not a correct statement, because insulin promotes glucose uptake by the\ - \ liver, adipose tissue, and muscle, but not all tissues. For instance, the tissues\ - \ in the brain and red blood cells are not affected by insulin. The answer is (D).\n\ - \nQ: A high school science teacher fills a 1 liter bottle with pure nitrogen and\ - \ seals the lid. 
The pressure is 1.70 atm, and the room temperature is 25\xB0C.\ - \ Which two variables will both increase the pressure of the system, if all other\ - \ variables are held constant?\n(A) Increasing temperature, increasing moles of\ - \ gas (B) Increasing temperature, increasing volume (C) Decreasing volume, decreasing\ - \ temperature (D) Decreasing moles of gas, increasing volume\nA: Let's think step\ - \ by step. We refer to Wikipedia articles on medicine for help. The relevant equation\ - \ for this is the ideal gas law: PV=nRT. To increase the pressure of the system\ - \ (P), then either n (number of moles of the gas) or T (temperature) have to increase.\ - \ The answer is (A).\n\nQ: In a genetic test of a newborn, a rare genetic disorder\ - \ is found that has X-linked recessive transmission. Which of the following statements\ - \ is likely true regarding the pedigree of this disorder?\n(A) All descendants on\ - \ the maternal side will have the disorder. (B) Females will be approximately twice\ - \ as affected as males in this family. (C) All daughters of an affected male will\ - \ be affected. (D) There will be equal distribution of males and females affected.\n\ - A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\ - \ Let\u2019s solve this step by step. Let's recall first that females have two X\ - \ chromosomes, while males have one X and one Y chromosome. This is an important\ - \ fact we need to know before answering this question. \nBecause a male can only\ - \ pass his only one X chromosome to a daughter, if he is affected by this rare genetic\ - \ disorder, then we know for sure that he will pass this rare genetic disorder to\ - \ all his future-born daughters. Therefore, \u201C(C): All daughters of an affected\ - \ male will be affected\u201D is a correct statement. The answer is (C).\n\nQ: Glucose\ - \ is transported into the muscle cell:\n(A) via protein transporters called GLUT4.\ - \ (B) only in the presence of insulin. (C) via hexokinase. (D) via monocarbylic\ - \ acid transporters.\nA: Let's think step by step. We refer to Wikipedia articles\ - \ on medicine for help. Glucose (also known as the blood sugar) is the main sugar\ - \ found in the human body. It is transported into the muscle cell via diffusion\ - \ through protein transporters called GLUT4. The answer is (A)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_medicine + \ step by step. We refer to Wikipedia articles on medicine for help. Let’s solve\ + \ this step by step and go over each choice: \n(A) “Muscle glycogen is broken down\ + \ enzymatically to glucose-1-phosphate”: This is a correct statement.\n(B) “Elite\ + \ endurance runners have a high proportion of Type I fibres in their leg muscles”:\ + \ This is a correct statement.\n(C) “Liver glycogen is important in the maintenance\ + \ of the blood glucose concentration”: This is a correct statement. \n(D) “Insulin\ + \ promotes glucose uptake by all tissues in the body”: This is not a correct statement,\ + \ because insulin promotes glucose uptake by the liver, adipose tissue, and muscle,\ + \ but not all tissues. For instance, the tissues in the brain and red blood cells\ + \ are not affected by insulin. The answer is (D).\n\nQ: A high school science teacher\ + \ fills a 1 liter bottle with pure nitrogen and seals the lid. The pressure is 1.70\ + \ atm, and the room temperature is 25°C. 
Which two variables will both increase\ + \ the pressure of the system, if all other variables are held constant?\n(A) Increasing\ + \ temperature, increasing moles of gas (B) Increasing temperature, increasing volume\ + \ (C) Decreasing volume, decreasing temperature (D) Decreasing moles of gas, increasing\ + \ volume\nA: Let's think step by step. We refer to Wikipedia articles on medicine\ + \ for help. The relevant equation for this is the ideal gas law: PV=nRT. To increase\ + \ the pressure of the system (P), then either n (number of moles of the gas) or\ + \ T (temperature) have to increase. The answer is (A).\n\nQ: In a genetic test of\ + \ a newborn, a rare genetic disorder is found that has X-linked recessive transmission.\ + \ Which of the following statements is likely true regarding the pedigree of this\ + \ disorder?\n(A) All descendants on the maternal side will have the disorder. (B)\ + \ Females will be approximately twice as affected as males in this family. (C) All\ + \ daughters of an affected male will be affected. (D) There will be equal distribution\ + \ of males and females affected.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on medicine for help. Let’s solve this step by step. Let's recall first\ + \ that females have two X chromosomes, while males have one X and one Y chromosome.\ + \ This is an important fact we need to know before answering this question. \nBecause\ + \ a male can only pass his only one X chromosome to a daughter, if he is affected\ + \ by this rare genetic disorder, then we know for sure that he will pass this rare\ + \ genetic disorder to all his future-born daughters. Therefore, “(C): All daughters\ + \ of an affected male will be affected” is a correct statement. The answer is (C).\n\ + \nQ: Glucose is transported into the muscle cell:\n(A) via protein transporters\ + \ called GLUT4. (B) only in the presence of insulin. (C) via hexokinase. (D) via\ + \ monocarbylic acid transporters.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on medicine for help. Glucose (also known as the blood sugar) is the\ + \ main sugar found in the human body. It is transported into the muscle cell via\ + \ diffusion through protein transporters called GLUT4. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml index d500a5b8..f4135204 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml @@ -1,70 +1,44 @@ -dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college - physics. - - - Q: A refracting telescope consists of two converging lenses separated by 100 cm. - The eye-piece lens has a focal length of 20 cm. The angular magnification of the - telescope is - - (A) 4 (B) 5 (C) 6 (D) 20 - - A: Let''s think step by step. In a refracting telescope, if both lenses are converging, - the focus of both lenses must be between the two lenses, and thus the focal lengths - of the two lenses must add up to their separation. Since the focal length of one - lens is 20 cm, the focal length of the other must be 80 cm. The magnification is - the ratio of these two focal lengths, or 4. The answer is (A). 
- - - Q: The muon decays with a characteristic lifetime of about 10^-6 second into an - electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from - decaying into an electron and just a single neutrino by the law of conservation - of - - (A) charge (B) mass (C) energy and momentum (D) lepton number - - A: Let''s think step by step. Lepton number must be conserved, meaning the total - number of leptons minus the number of antileptons. If a muon decays into an electron - and a single neutrino, the total lepton number would go from one to two, violating - lepton number conservation. The answer is (D). - - - Q: One end of a Nichrome wire of length 2L and cross-sectional area A is attached - to an end of another Nichrome wire of length L and cross- sectional area 2A. If - the free end of the longer wire is at an electric potential of 8.0 volts, and the - free end of the shorter wire is at an electric potential of 1.0 volt, the potential - at the junction of the two wires is most nearly equal to - - (A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V - - A: Let''s think step by step. This is a simple voltage divider problem, where the - longer wire has a resistance four times that of the shorter end. So the voltage - divider ratio is 1 / 5, meaning that the potential in the middle is 1.0 V + (8.0 - V - 1.0 V) * 1/5 = 2.4 V. The answer is (A). - - - Q: A refracting telescope consists of two converging lenses separated by 100 cm. - The eye-piece lens has a focal length of 20 cm. The angular magnification of the - telescope is - - (A) 4 (B) 5 (C) 6 (D) 20 - - A: Let''s think step by step. In a refracting telescope, if both lenses are converging, - the focus of both lenses must be between the two lenses, and thus the focal lengths - of the two lenses must add up to their separation. Since the focal length of one - lens is 20 cm, the focal length of the other must be 80 cm. The magnification is - the ratio of these two focal lengths, or 4. The answer is (A). - - - Q: For which of the following thermodynamic processes is the increase in the internal - energy of an ideal gas equal to the heat added to the gas? - - (A) Constant temperature (B) Constant volume (C) Constant pressure (D) Adiabatic - - A: Let''s think step by step. Heat added to the gas can go into the gases internal - energy or work done against an external force. However, if the volume of the gas - container is constant, no work will be done (since work is pressure times change - in volume). So, at constant volume, all of the heat goes into the internal energy. - The answer is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_physics +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\nQ: A refracting telescope consists of two converging lenses separated\ + \ by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification\ + \ of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In\ + \ a refracting telescope, if both lenses are converging, the focus of both lenses\ + \ must be between the two lenses, and thus the focal lengths of the two lenses must\ + \ add up to their separation. Since the focal length of one lens is 20 cm, the focal\ + \ length of the other must be 80 cm. The magnification is the ratio of these two\ + \ focal lengths, or 4. 
The answer is (A).\n\nQ: The muon decays with a characteristic\ + \ lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron\ + \ antineutrino. The muon is forbidden from decaying into an electron and just a\ + \ single neutrino by the law of conservation of\n(A) charge (B) mass (C) energy\ + \ and momentum (D) lepton number\nA: Let's think step by step. Lepton number must\ + \ be conserved, meaning the total number of leptons minus the number of antileptons.\ + \ If a muon decays into an electron and a single neutrino, the total lepton number\ + \ would go from one to two, violating lepton number conservation. The answer is\ + \ (D).\n\nQ: One end of a Nichrome wire of length 2L and cross-sectional area A\ + \ is attached to an end of another Nichrome wire of length L and cross- sectional\ + \ area 2A. If the free end of the longer wire is at an electric potential of 8.0\ + \ volts, and the free end of the shorter wire is at an electric potential of 1.0\ + \ volt, the potential at the junction of the two wires is most nearly equal to\n\ + (A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V\nA: Let's think step by step. This is a\ + \ simple voltage divider problem, where the longer wire has a resistance four times\ + \ that of the shorter end. So the voltage divider ratio is 1 / 5, meaning that the\ + \ potential in the middle is 1.0 V + (8.0 V - 1.0 V) * 1/5 = 2.4 V. The answer is\ + \ (A).\n\nQ: A refracting telescope consists of two converging lenses separated\ + \ by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification\ + \ of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In\ + \ a refracting telescope, if both lenses are converging, the focus of both lenses\ + \ must be between the two lenses, and thus the focal lengths of the two lenses must\ + \ add up to their separation. Since the focal length of one lens is 20 cm, the focal\ + \ length of the other must be 80 cm. The magnification is the ratio of these two\ + \ focal lengths, or 4. The answer is (A).\n\nQ: For which of the following thermodynamic\ + \ processes is the increase in the internal energy of an ideal gas equal to the\ + \ heat added to the gas?\n(A) Constant temperature (B) Constant volume (C) Constant\ + \ pressure (D) Adiabatic\nA: Let's think step by step. Heat added to the gas can\ + \ go into the gases internal energy or work done against an external force. However,\ + \ if the volume of the gas container is constant, no work will be done (since work\ + \ is pressure times change in volume). So, at constant volume, all of the heat goes\ + \ into the internal energy. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml index 08f23169..11d2736a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml @@ -1,35 +1,36 @@ -dataset_name: computer_security -description: "The following are multiple choice questions (with answers) about computer\ +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer\ \ security.\n\nQ: SHA-1 has a message digest of\n(A) 160 bits (B) 512 bits (C) 628\ \ bits (D) 820 bits\nA: Let's think step by step. 
Since SHA-1 is a hash function\ \ which takes an input and produces a 160-bit (20-byte) hash value, its message\ \ digest is 160 bits. The answer is (A).\n\nQ: _____________ can modify data on\ - \ your system \u2013 so that your system doesn\u2019t run correctly or you can no\ - \ longer access specific data, or it may even ask for ransom in order to give your\ - \ access.\n(A) IM \u2013 Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D)\ - \ Ransom Trojan\nA: Let's think step by step. The system is asking for trojans,\ - \ which are for ransom, which means ransom trojan. The answer is (D).\n\nQ: What\ - \ is ethical hacking?\n(A) \"Hacking\" ethics so they justify unintended selfish\ - \ behavior (B) Hacking systems (e.g., during penetration testing) to expose vulnerabilities\ - \ so they can be fixed, rather than exploited (C) Hacking into systems run by those\ - \ whose ethics you disagree with (D) A slang term for rapid software development,\ - \ e.g., as part of hackathons\nA: Let's think step by step. Ethical hacking is a\ - \ process of detecting vulnerabilities in an application, system, or organization's\ - \ infrastructure that an attacker can use to exploit an individual or organization.\ - \ They use this process to prevent cyberattacks and security breaches by lawfully\ - \ hacking into the systems and looking for weak points. The answer is (B).\n\nQ:\ - \ The ____________ is anything which your search engine cannot search.\n(A) Haunted\ - \ web (B) World Wide Web (C) Surface web (D) Deep Web\nA: Let's think step by step.\ - \ The search engine searches on the Surface Web, which is the portion of the world\ - \ wide web which is visible so (B,C) are wrong. The Haunted Web doesn\u2019t correspond\ - \ to an internet concept. The Deep Web is the part of the World Wide Web which is\ - \ not indexed. The answer is (D).\n\nQ: Exploitation of the Heartbleed bug permits\n\ - (A) overwriting cryptographic keys in memory (B) a kind of code injection (C) a\ - \ read outside bounds of a buffer (D) a format string attack\nA: Let's think step\ - \ by step. The Heartbleed Bug is a serious vulnerability in the popular OpenSSL\ - \ cryptographic software library. Heartbleed resulted from improper input validation\ - \ (due to a missing bounds check) in the implementation of the TLS heartbeat extension.\ - \ The vulnerability was classified as a buffer over-read, a situation where more\ - \ data can be read than should be allowed. The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_computer_security + \ your system – so that your system doesn’t run correctly or you can no longer access\ + \ specific data, or it may even ask for ransom in order to give your access.\n(A)\ + \ IM – Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D) Ransom Trojan\nA:\ + \ Let's think step by step. The system is asking for trojans, which are for ransom,\ + \ which means ransom trojan. The answer is (D).\n\nQ: What is ethical hacking?\n\ + (A) \"Hacking\" ethics so they justify unintended selfish behavior (B) Hacking systems\ + \ (e.g., during penetration testing) to expose vulnerabilities so they can be fixed,\ + \ rather than exploited (C) Hacking into systems run by those whose ethics you disagree\ + \ with (D) A slang term for rapid software development, e.g., as part of hackathons\n\ + A: Let's think step by step. 
Ethical hacking is a process of detecting vulnerabilities\ + \ in an application, system, or organization's infrastructure that an attacker can\ + \ use to exploit an individual or organization. They use this process to prevent\ + \ cyberattacks and security breaches by lawfully hacking into the systems and looking\ + \ for weak points. The answer is (B).\n\nQ: The ____________ is anything which your\ + \ search engine cannot search.\n(A) Haunted web (B) World Wide Web (C) Surface web\ + \ (D) Deep Web\nA: Let's think step by step. The search engine searches on the Surface\ + \ Web, which is the portion of the world wide web which is visible so (B,C) are\ + \ wrong. The Haunted Web doesn’t correspond to an internet concept. The Deep Web\ + \ is the part of the World Wide Web which is not indexed. The answer is (D).\n\n\ + Q: Exploitation of the Heartbleed bug permits\n(A) overwriting cryptographic keys\ + \ in memory (B) a kind of code injection (C) a read outside bounds of a buffer (D)\ + \ a format string attack\nA: Let's think step by step. The Heartbleed Bug is a serious\ + \ vulnerability in the popular OpenSSL cryptographic software library. Heartbleed\ + \ resulted from improper input validation (due to a missing bounds check) in the\ + \ implementation of the TLS heartbeat extension. The vulnerability was classified\ + \ as a buffer over-read, a situation where more data can be read than should be\ + \ allowed. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml index df845ce8..4866041d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml @@ -1,32 +1,33 @@ -dataset_name: conceptual_physics -description: "\nThe following are multiple choice questions (with answers) about conceptual\ - \ physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted to a different\ - \ frequency (B) deflection (C) interference (D) polarization\nA: Let's think step\ - \ by step. In a soap bubble film, the light bounces between the two soap-air interfaces\ - \ many times, interfering with itself constructively or destructively depending\ - \ on the width of the film. This results in different colors being visible. The\ - \ answer is (C).\n\nQ: Compared with the mass of a uranium atom undergoing fission,\ - \ the combined masses of the products after fission are\n(A) less (B) more (C) the\ - \ same (D) zero\nA: Let's think step by step. Fission releases energy, which comes\ - \ from the rest mass of its initial nucleus. Thus the mass of the products is less\ - \ than the mass of the reactant uranium nucleus. The answer is (A).\n\nQ: Things\ - \ that are equivalent according to the equivalence principle are\n(A) space and\ - \ time. (B) a traveling twin and a stay-at-home twin. (C) gravity and acceleration.\ - \ (D) mass and energy.\nA: Let's think step by step. Einstein\u2019s famous equivalence\ - \ principle states that gravity and acceleration are equivalent. The answer is (C).\n\ - \nQ: Which of these three elements has the most mass per nucleon?\n(A) Hydrogen\ - \ (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step. 
Due to nuclear\ - \ binding energy, the mass of an atomic nucleus is less than the sum of individual\ - \ masses of the free constituent protons and neutrons; this is known as the mass\ - \ defect. Hydrogen has no mass defect because it has only a single nucleon, so it\ - \ will have the most mass per nucleon. The answer is (A).\n\nQ: A model airplane\ - \ flies slower when flying into the wind and faster with wind at its back. When\ - \ launched at right angles to the wind a cross wind its groundspeed compared with\ - \ flying in still air is\n(A) the same (B) greater (C) less (D) either greater or\ - \ less depending on wind speed\nA: Let's think step by step. The plane\u2019s speed\ - \ in the direction of the wind is greater than it would be in the absence of wind,\ - \ and its direction orthogonal to the wind is the same as it would be in the absence\ - \ of the wind. The total speed, which is these two components added in quadrature,\ - \ is thus greater than the speed in still air. The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_conceptual_physics +"dataset_name": "conceptual_physics" +"description": "\nThe following are multiple choice questions (with answers) about\ + \ conceptual physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted\ + \ to a different frequency (B) deflection (C) interference (D) polarization\nA:\ + \ Let's think step by step. In a soap bubble film, the light bounces between the\ + \ two soap-air interfaces many times, interfering with itself constructively or\ + \ destructively depending on the width of the film. This results in different colors\ + \ being visible. The answer is (C).\n\nQ: Compared with the mass of a uranium atom\ + \ undergoing fission, the combined masses of the products after fission are\n(A)\ + \ less (B) more (C) the same (D) zero\nA: Let's think step by step. Fission releases\ + \ energy, which comes from the rest mass of its initial nucleus. Thus the mass of\ + \ the products is less than the mass of the reactant uranium nucleus. The answer\ + \ is (A).\n\nQ: Things that are equivalent according to the equivalence principle\ + \ are\n(A) space and time. (B) a traveling twin and a stay-at-home twin. (C) gravity\ + \ and acceleration. (D) mass and energy.\nA: Let's think step by step. Einstein’s\ + \ famous equivalence principle states that gravity and acceleration are equivalent.\ + \ The answer is (C).\n\nQ: Which of these three elements has the most mass per nucleon?\n\ + (A) Hydrogen (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step.\ + \ Due to nuclear binding energy, the mass of an atomic nucleus is less than the\ + \ sum of individual masses of the free constituent protons and neutrons; this is\ + \ known as the mass defect. Hydrogen has no mass defect because it has only a single\ + \ nucleon, so it will have the most mass per nucleon. The answer is (A).\n\nQ: A\ + \ model airplane flies slower when flying into the wind and faster with wind at\ + \ its back. When launched at right angles to the wind a cross wind its groundspeed\ + \ compared with flying in still air is\n(A) the same (B) greater (C) less (D) either\ + \ greater or less depending on wind speed\nA: Let's think step by step. The plane’s\ + \ speed in the direction of the wind is greater than it would be in the absence\ + \ of wind, and its direction orthogonal to the wind is the same as it would be in\ + \ the absence of the wind. 
The total speed, which is these two components added\ + \ in quadrature, is thus greater than the speed in still air. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml index 33883f47..c97ae1b2 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml @@ -1,63 +1,63 @@ -dataset_name: econometrics -description: "The following are multiple choice questions (with answers) about econometrics.\n\ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ \nQ: Suppose now that a researcher wishes to use information criteria to determine\ \ the optimal lag length for a VAR. 500 observations are available for the bi-variate\ \ VAR, and the values of the determinant of the variance-covariance matrix of residuals\ \ are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively. What\ \ is the optimal model order according to Akaike's information criterion?\n(A) 1\ \ lag (B) 2 lags (C) 3 lags (D) 4 lags\nA: Let's think step by step. We refer to\ - \ Wikipedia articles on econometrics for help. Let\u2019s solve this problem step\ - \ by step. First of all, let\u2019s recall that for a given set of data, Akaike's\ - \ information criterion (AIC) allows us to measure how well a statistical model\ - \ fits the data; it is an estimator of prediction error. Here in this problem we\ - \ will need to use the formula ln(det(sigma_hat)) + (2 * k / T) to determine the\ - \ values of Akaike\u2019s criterion, where ln denotes the natural log function,\ - \ det the determinant function, k the total number of parameters in total (across\ - \ both equations), and T the number of observations (which, in this case, is equal\ - \ to 500). For 1 lag, the number of parameters in total is equal to 6; for 2 lags,\ - \ it is 10; for 3 lags, it is 14; and for 4 lags, it is 18. Now, let\u2019s calculate\ - \ the values of the criterion for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500)\ - \ = ln(0.0336) + (12 / 500) = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) =\ - \ ln(0.0169) + (20 / 500) = -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084)\ - \ + (28 / 500) =-4.724\n(D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36\ - \ / 500) =-5.011\nBecause the optimal model order according to AIC minimizes the\ - \ information criterion, the answer should be the one with the lowest value. In\ - \ this case, (D) has the lowest value. The answer is (C).\n\nQ: Consider the following\ - \ AR(1) model with the disturbances having zero mean and unit variance\nyt = 0.2\ - \ + 0.4 yt-1 + ut\nThe (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4\ - \ (C) 0.5 (D) 0.33\nA: Let's think step by step. We refer to Wikipedia articles\ - \ on econometrics for help. Let\u2019s solve this problem step by step. 
If we have\ - \ a an AR(1) model with the disturbances having zero mean and unit variance, then\ - \ the unconditional mean of y is equal to the following:\nunconditional mean of\ - \ y = (the intercept term) / (1 - autoregressive coefficient)\nWe know that the\ - \ intercept term is 0.2 and the autoregressive coefficient is 0.4; thus, we have:\n\ - unconditional mean of y = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which\ - \ is approximately 0.33. That means that the answer should be (D) 0.33. The answer\ - \ is (D).\n\nQ: What would be then consequences for the OLS estimator if heteroscedasticity\ - \ is present in a regression model but ignored?\n(A) It will be biased (B) It will\ - \ be inconsistent (C) It will be inefficient (D) All of (a), (b) and (c) will be\ - \ true.\nA: Let's think step by step. We refer to Wikipedia articles on econometrics\ - \ for help. Heteroscedasticity refers to the condition where the variance of the\ - \ error terms is not constant across multiple observations. If heteroscedasticity\ - \ is present in a regression model, then the coefficient estimates in the OLS estimator\ - \ will be not only unbiased and consistent but also inefficient. Because (A) and\ - \ (B) are incorrect choices and (C) is a correct choice, (D) cannot be the right\ - \ answer. Ultimately, (C) is the only true choice. The answer is (C).\n\nQ: Suppose\ - \ that a test statistic has associated with it a p-value of 0.08. Which one of the\ - \ following statements is true?\n(i) If the size of the test were exactly 8%, we\ - \ would be indifferent between rejecting and not rejecting the null hypothesis\n\ - (ii) The null would be rejected if a 10% size of test were used\n(iii) The null\ - \ would not be rejected if a 1% size of test were used\n(iv) The null would be rejected\ - \ if a 5% size of test were used.\n(A) (ii) and (iv) only (B) (i) and (iii) only\ - \ (C) (i), (ii), and (iii) only (D) (i), (ii), (iii), and (iv).\nA: Let's think\ - \ step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019\ - s reason about each of the options.\n(i) is a true statement.\n(ii) is a true statement.\n\ - (iii) is a true statement.\n(iv) is not a true statement. Thus, (i), (ii), and (iii)\ - \ are true. The answer is (C).\n\nQ: For a stationary autoregressive process, shocks\ - \ will\n(A) Eventually die away (B) Persist indefinitely (C) Grow exponentially\ - \ (D) Never occur\nA: Let's think step by step. We refer to Wikipedia articles on\ - \ econometrics for help. This is a formal logic problem about stationally process.\ - \ For a stationary autoregressive process, shocks will eventually die away. The\ - \ answer is (A)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_econometrics + \ Wikipedia articles on econometrics for help. Let’s solve this problem step by\ + \ step. First of all, let’s recall that for a given set of data, Akaike's information\ + \ criterion (AIC) allows us to measure how well a statistical model fits the data;\ + \ it is an estimator of prediction error. Here in this problem we will need to use\ + \ the formula ln(det(sigma_hat)) + (2 * k / T) to determine the values of Akaike’s\ + \ criterion, where ln denotes the natural log function, det the determinant function,\ + \ k the total number of parameters in total (across both equations), and T the number\ + \ of observations (which, in this case, is equal to 500). 
For 1 lag, the number\ + \ of parameters in total is equal to 6; for 2 lags, it is 10; for 3 lags, it is\ + \ 14; and for 4 lags, it is 18. Now, let’s calculate the values of the criterion\ + \ for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500) = ln(0.0336) + (12 / 500)\ + \ = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) = ln(0.0169) + (20 / 500) =\ + \ -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084) + (28 / 500) =-4.724\n\ + (D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36 / 500) =-5.011\nBecause\ + \ the optimal model order according to AIC minimizes the information criterion,\ + \ the answer should be the one with the lowest value. In this case, (D) has the\ + \ lowest value. The answer is (C).\n\nQ: Consider the following AR(1) model with\ + \ the disturbances having zero mean and unit variance\nyt = 0.2 + 0.4 yt-1 + ut\n\ + The (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4 (C) 0.5 (D) 0.33\n\ + A: Let's think step by step. We refer to Wikipedia articles on econometrics for\ + \ help. Let’s solve this problem step by step. If we have a an AR(1) model with\ + \ the disturbances having zero mean and unit variance, then the unconditional mean\ + \ of y is equal to the following:\nunconditional mean of y = (the intercept term)\ + \ / (1 - autoregressive coefficient)\nWe know that the intercept term is 0.2 and\ + \ the autoregressive coefficient is 0.4; thus, we have:\nunconditional mean of y\ + \ = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which is approximately 0.33.\ + \ That means that the answer should be (D) 0.33. The answer is (D).\n\nQ: What would\ + \ be then consequences for the OLS estimator if heteroscedasticity is present in\ + \ a regression model but ignored?\n(A) It will be biased (B) It will be inconsistent\ + \ (C) It will be inefficient (D) All of (a), (b) and (c) will be true.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on econometrics for help. Heteroscedasticity\ + \ refers to the condition where the variance of the error terms is not constant\ + \ across multiple observations. If heteroscedasticity is present in a regression\ + \ model, then the coefficient estimates in the OLS estimator will be not only unbiased\ + \ and consistent but also inefficient. Because (A) and (B) are incorrect choices\ + \ and (C) is a correct choice, (D) cannot be the right answer. Ultimately, (C) is\ + \ the only true choice. The answer is (C).\n\nQ: Suppose that a test statistic has\ + \ associated with it a p-value of 0.08. Which one of the following statements is\ + \ true?\n(i) If the size of the test were exactly 8%, we would be indifferent between\ + \ rejecting and not rejecting the null hypothesis\n(ii) The null would be rejected\ + \ if a 10% size of test were used\n(iii) The null would not be rejected if a 1%\ + \ size of test were used\n(iv) The null would be rejected if a 5% size of test were\ + \ used.\n(A) (ii) and (iv) only (B) (i) and (iii) only (C) (i), (ii), and (iii)\ + \ only (D) (i), (ii), (iii), and (iv).\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on econometrics for help. Let’s reason about each of the options.\n\ + (i) is a true statement.\n(ii) is a true statement.\n(iii) is a true statement.\n\ + (iv) is not a true statement. Thus, (i), (ii), and (iii) are true. 
The answer is\ + \ (C).\n\nQ: For a stationary autoregressive process, shocks will\n(A) Eventually\ + \ die away (B) Persist indefinitely (C) Grow exponentially (D) Never occur\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on econometrics for help. This\ + \ is a formal logic problem about stationally process. For a stationary autoregressive\ + \ process, shocks will eventually die away. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml index cdd31ce4..ea7b24a0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml @@ -1,34 +1,34 @@ -dataset_name: electrical_engineering -description: "\nThe following are multiple choice questions (with answers) about electrical\ - \ engineering.\n\nQ: A point pole has a strength of 4\u03C0 * 10^-4 weber. The force\ - \ in newtons on a point pole of 4\u03C0 * 1.5 * 10^-4 weber placed at a distance\ +"dataset_name": "electrical_engineering" +"description": "\nThe following are multiple choice questions (with answers) about\ + \ electrical engineering.\n\nQ: A point pole has a strength of 4π * 10^-4 weber.\ + \ The force in newtons on a point pole of 4π * 1.5 * 10^-4 weber placed at a distance\ \ of 10 cm from it will be\n(A) 15 N. (B) 20 N. (C) 7.5 N. (D) 3.75 N.\nA: Let's\ \ think step by step. The force between two point poles is given by m_1m_2/(mu_0\ - \ 4 \\pi r^2), in analogy to Coulomb\u2019s law. Plugging in the values given in\ - \ the question, we calculate that the force is approximately 15 N. The answer is\ - \ (A).\n\nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30\ - \ mm wide. The control torque is 240*10-6 N-m on full scale. If magnetic flux density\ - \ is 1Wb/m2 range of meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's\ - \ think step by step. The torque on a coil in a uniform magnetic field is given\ - \ by BANI, where B is the magnetic flux density, A is the area of the coil, N is\ - \ the number of turns, and I is the current. So we have that I = (Torque)/(BAN),\ - \ or 240e-6/(1200e-6 * 100 * 1) = 2e-3. The answer is (B).\n\nQ: In an SR latch\ - \ built from NOR gates, which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1\ - \ (C) S=1, R=0 (D) S=1, R=1\nA: Let's think step by step. An SR latch is a set-reset\ - \ latch; in the case where S=1 and R=1, the circuit has no stable state; instead\ - \ a race condition will be produced within the circuit, so the device will be in\ - \ an undefined state. So S=1, R=1 is an illegal input. The answer is (D).\n\nQ:\ - \ Two long parallel conductors carry 100 A. If the conductors are separated by 20\ - \ mm, the force per meter of length of each conductor will be\n(A) 100 N. (B) 0.1\ - \ N. (C) 1 N. (D) 0.01 N.\nA: Let's think step by step. The magnetic force-per-length\ - \ between two current-carrying conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r),\ - \ where $r$ is the separation distance and I_1 and I_2 are the currents. Plugging\ - \ in 100 A for I_1 and I_2, and 20 mm for r, gives 0.1 N. The answer is (B).\n\n\ - Q: In a 2 pole lap winding dc machine , the resistance of one conductor is 2\u03A9\ - \ and total number of conductors is 100. 
Find the total resistance\n(A) 200\u03A9\ - \ (B) 100\u03A9 (C) 50\u03A9 (D) 10\u03A9\nA: Let's think step by step. In lap winding,\ - \ effectively two resistors are connected in parallel, so the actual resistance\ - \ of each pair is 1 Ohm. Since we have 50 pairs, we get a total resistance of 50\ - \ Ohms. The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_electrical_engineering + \ 4 \\pi r^2), in analogy to Coulomb’s law. Plugging in the values given in the\ + \ question, we calculate that the force is approximately 15 N. The answer is (A).\n\ + \nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30 mm wide.\ + \ The control torque is 240*10-6 N-m on full scale. If magnetic flux density is\ + \ 1Wb/m2 range of meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's think\ + \ step by step. The torque on a coil in a uniform magnetic field is given by BANI,\ + \ where B is the magnetic flux density, A is the area of the coil, N is the number\ + \ of turns, and I is the current. So we have that I = (Torque)/(BAN), or 240e-6/(1200e-6\ + \ * 100 * 1) = 2e-3. The answer is (B).\n\nQ: In an SR latch built from NOR gates,\ + \ which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1 (C) S=1, R=0 (D) S=1,\ + \ R=1\nA: Let's think step by step. An SR latch is a set-reset latch; in the case\ + \ where S=1 and R=1, the circuit has no stable state; instead a race condition will\ + \ be produced within the circuit, so the device will be in an undefined state. So\ + \ S=1, R=1 is an illegal input. The answer is (D).\n\nQ: Two long parallel conductors\ + \ carry 100 A. If the conductors are separated by 20 mm, the force per meter of\ + \ length of each conductor will be\n(A) 100 N. (B) 0.1 N. (C) 1 N. (D) 0.01 N.\n\ + A: Let's think step by step. The magnetic force-per-length between two current-carrying\ + \ conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r), where $r$ is the separation\ + \ distance and I_1 and I_2 are the currents. Plugging in 100 A for I_1 and I_2,\ + \ and 20 mm for r, gives 0.1 N. The answer is (B).\n\nQ: In a 2 pole lap winding\ + \ dc machine , the resistance of one conductor is 2Ω and total number of conductors\ + \ is 100. Find the total resistance\n(A) 200Ω (B) 100Ω (C) 50Ω (D) 10Ω\nA: Let's\ + \ think step by step. In lap winding, effectively two resistors are connected in\ + \ parallel, so the actual resistance of each pair is 1 Ohm. Since we have 50 pairs,\ + \ we get a total resistance of 50 Ohms. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml index a85f799c..b5567c0c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml @@ -1,5 +1,5 @@ -dataset_name: elementary_mathematics -description: "The following are multiple choice questions (with answers) about elementary\ +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary\ \ mathematics.\n\nQ: Olivia used the rule \"Add 11\" to create the number pattern\ \ shown below. 10, 21, 32, 43, 54. 
Which statement about the number pattern is true?\n\ (A) The 10th number in the pattern will be an even number.\n(B) The number pattern\ @@ -22,19 +22,20 @@ description: "The following are multiple choice questions (with answers) about e \ the other choices are incorrect. The answer is (A).\n\nQ: A store sells 107 different\ \ colors of paint. They have 25 cans of each color in storage. The number of cans\ \ of paint the store has in storage can be found using the expression below. 107\ - \ \xD7 25. How many cans of paint does the store have in storage?\n(A) 749\n(B)\ - \ 2,675\n(C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107\ - \ x 25 = (100 x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total\ - \ of 30 players will play basketball at a park. There will be exactly 5 players\ - \ on each team. Which statement correctly explains how to find the number of teams\ - \ needed?\n(A) Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n\ - (C) Multiply 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\n\ - A: Let's think step by step. We want to find the number of teams. We know that there\ - \ are 5 players/team, and 30 players. Thus to get the number of teams we divide\ - \ players by players/team, so 30 players / 5 players/team = 6 teams. The answer\ - \ is (B).\n\nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n\ - (B) (5 x 5) + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think\ - \ step by step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5\ - \ x 4). The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_elementary_mathematics + \ × 25. How many cans of paint does the store have in storage?\n(A) 749\n(B) 2,675\n\ + (C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107 x 25 = (100\ + \ x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total of 30 players\ + \ will play basketball at a park. There will be exactly 5 players on each team.\ + \ Which statement correctly explains how to find the number of teams needed?\n(A)\ + \ Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n(C) Multiply\ + \ 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\nA: Let's\ + \ think step by step. We want to find the number of teams. We know that there are\ + \ 5 players/team, and 30 players. Thus to get the number of teams we divide players\ + \ by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B).\n\ + \nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n(B) (5 x 5)\ + \ + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think step by\ + \ step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5 x 4). The\ + \ answer is (B)." 
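(Each converted file also gains a `group` key — visible in the `+` lines that follow — slotting the subject into one of MMLU's high-level categories for aggregate scoring. A sketch of the implied mapping, reconstructed from the `group` values in this patch rather than from the generator script, so treat the dict as illustrative:)

```python
# Reconstructed from the "group" values added across this patch; the real
# subject-to-category table may live elsewhere in the repo.
SUBJECT_CATEGORIES = {
    "college_computer_science": "stem",
    "college_mathematics": "stem",
    "college_medicine": "other",
    "college_physics": "stem",
    "computer_security": "stem",
    "conceptual_physics": "stem",
    "econometrics": "social_sciences",
    "electrical_engineering": "stem",
    "elementary_mathematics": "stem",
    "formal_logic": "humanities",
}


def group_for(subject: str) -> str:
    """Return the aggregate group tag for one MMLU subject."""
    return f"mmlu_flan_cot_fewshot_{SUBJECT_CATEGORIES[subject]}"


assert group_for("econometrics") == "mmlu_flan_cot_fewshot_social_sciences"
```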
+"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml index 5de7486c..40adf465 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml @@ -1,57 +1,53 @@ -dataset_name: formal_logic -description: "The following are multiple choice questions (with answers) about formal\ +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal\ \ logic.\n\nQ: Which of the given formulas of PL is the best symbolization of the\ \ following sentence?\nTurtles live long lives and are happy creatures, unless they\ - \ are injured.\n(A) (L \u2022 H) \u2261 I (B) (L \u2022 H) \u2228 I (C) L \u2022\ - \ (H \u2228 I) (D) L \u2022 (H \u2283 R).\nA: Let's think step by step. We refer\ - \ to Wikipedia articles on formal logic for help. Let\u2019s solve this step by\ - \ step. Let \u201CL\u201D denote \u201Cliving long\u201D, H \u201Cbeing happy\u201D\ - , and \u201CI\u201D \u201Cbeing injured\u201D. Now, consider each choice:\n(A) means\ - \ (living long AND being happy) is equivalent to (being injured). \n(B) means (living\ - \ long AND being happy) OR (being injured). \n(C) means (living long) AND (being\ - \ happy OR being injured). \n(D) means (living long) AND (being happy implies being\ - \ R), but what R denotes is not clear.\nObviously, (B) is the best symbolization\ - \ of the original sentence. The answer is (B).\n\nQ: Select the best translation\ - \ into predicate logic.George borrows Hector's lawnmower. (g: George; h: Hector;\ - \ l: Hector's lawnmower; Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh\ - \ (D) Bghl\nA: Let's think step by step. We refer to Wikipedia articles on formal\ - \ logic for help. Let\u2019s solve this step by step. We are told that \u201CBxyx\u201D\ - \ means \u201Cx borrows y from z\u201D. We can rewrite \u201CGeorge borrows Hector's\ - \ lawnmower\u201D as \u201CGeorge borrows a lawnmower from Hector\u201D, which can\ - \ then be translated into predicate logic as \u201CBglh\u201D. The answer \u201C\ - Bglh\u201D appears in (C); therefore, (C) must be the correct answer. The answer\ - \ is (C).\n\nQ: \nSelect the best English interpretation of the given arguments\ - \ in predicate logic.\nDm\n(\u2200x)(Wx \u2283 ~Dx). \n(\u2200x)Wx \u2228 Ag\t/\ - \ (\u2203x)Ax\n(A) Marina is a dancer. Some weaklings are not dancers. Either everything\ - \ is a weakling or Georgia plays volleyball. So something plays volleyball. (B)\ - \ Marina is a dancer. No weakling is a dancer. Everything is either a weakling or\ - \ plays volleyball. So something plays volleyball. (C) Marina is a dancer. Some\ - \ weaklings are not dancers. Everything is either a weakling or plays volleyball.\ - \ So something plays volleyball. (D) Marina is a dancer. No weakling is a dancer.\ - \ Either everything is a weakling or Georgia plays volleyball. So something plays\ - \ volleyball.\nA: Let's think step by step. We refer to Wikipedia articles on formal\ - \ logic for help. Let\u2019s solve this step by step. 
Let \u201CD\u201D denote \u201C\ - being a dancer\u201D, \u201Cm\u201D denote \u201CMaria\u201D, \u201Cg\u201D denote\ - \ \u201CGeorgia\u201D, \u201CW\u201D denote \u201Cweakling\u201D, \u201CA\u201D\ - \ denote \u201Cplaying volleyball\u201D. Then, we have the following:\n1. Dm \u2192\ - \ Maria is a dance.\n2. (\u2200x)(Wx \u2283 ~Dx). \u2192 For all x, if x is a weakling,\ - \ then x is not a dancer. In other words, no weakling is a dancer.\n3. (\u2200x)Wx\ - \ \u2228 Ag\t/ (\u2203x)Ax \u2192 For all x, x is a weakling or Georgia plays volleyball.\ - \ So there exists an x that plays volleyball. \nOptions (A) and (C) do claim that\ - \ some weaklings are not dancers, but the second argument strongly states that no\ - \ weakling is a dancer. Thus, we can eliminate them. Option (B) omits the important\ - \ detail about Georgia playing volleyball. Option (D) has all the details presented\ - \ in the arguments and is the best English interpretation of the arguments. The\ - \ answer is (D).\n\nQ: Select the best translation into predicate logic: No people\ - \ drive on Mars.\n(A) ~Pd (B) (\u2200x)(Px \u2228 ~Dx) (C) (\u2200x)(Px \u2283 ~Dx)\ - \ (D) ~Dp\nA: Let's think step by step. We refer to Wikipedia articles on formal\ - \ logic for help. Let\u2019s solve this step by step. Let \u201CP\u201D denote \u201C\ - being on Mars\u201D and \u201CD\u201D denote \u201Cdriving on Mars\u201D. Then let\u2019\ - s consider each option:\nOption (A): ~Pd \u2192 d is not on Mars.\nOption (B): (\u2200\ - x)(Px \u2228 ~Dx) \u2192 For all x, x is on Mars and x do not drive on Mars.\nOption\ - \ (C): (\u2200x)(Px \u2283 ~Dx) \u2192 For all x, x is on Mars implies that x do\ - \ not drive on Mars.\nOption (D): ~Dp: \u2192 p do not drive on Mars.\nOf all these\ - \ options, Option (C) appears to be the best and most meaningful interpretation\ - \ of the argument \u201CNo people drive on Mars.\u201D The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_formal_logic + \ are injured.\n(A) (L • H) ≡ I (B) (L • H) ∨ I (C) L • (H ∨ I) (D) L • (H ⊃ R).\n\ + A: Let's think step by step. We refer to Wikipedia articles on formal logic for\ + \ help. Let’s solve this step by step. Let “L” denote “living long”, H “being happy”,\ + \ and “I” “being injured”. Now, consider each choice:\n(A) means (living long AND\ + \ being happy) is equivalent to (being injured). \n(B) means (living long AND being\ + \ happy) OR (being injured). \n(C) means (living long) AND (being happy OR being\ + \ injured). \n(D) means (living long) AND (being happy implies being R), but what\ + \ R denotes is not clear.\nObviously, (B) is the best symbolization of the original\ + \ sentence. The answer is (B).\n\nQ: Select the best translation into predicate\ + \ logic.George borrows Hector's lawnmower. (g: George; h: Hector; l: Hector's lawnmower;\ + \ Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh (D) Bghl\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on formal logic for help. Let’s solve\ + \ this step by step. We are told that “Bxyx” means “x borrows y from z”. We can\ + \ rewrite “George borrows Hector's lawnmower” as “George borrows a lawnmower from\ + \ Hector”, which can then be translated into predicate logic as “Bglh”. The answer\ + \ “Bglh” appears in (C); therefore, (C) must be the correct answer. The answer is\ + \ (C).\n\nQ: \nSelect the best English interpretation of the given arguments in\ + \ predicate logic.\nDm\n(∀x)(Wx ⊃ ~Dx). 
\n(∀x)Wx ∨ Ag\t/ (∃x)Ax\n(A) Marina is a\
+ \ dancer. Some weaklings are not dancers. Either everything is a weakling or Georgia\
+ \ plays volleyball. So something plays volleyball. (B) Marina is a dancer. No weakling\
+ \ is a dancer. Everything is either a weakling or plays volleyball. So something\
+ \ plays volleyball. (C) Marina is a dancer. Some weaklings are not dancers. Everything\
+ \ is either a weakling or plays volleyball. So something plays volleyball. (D) Marina\
+ \ is a dancer. No weakling is a dancer. Either everything is a weakling or Georgia\
+ \ plays volleyball. So something plays volleyball.\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on formal logic for help. Let’s solve this step\
+ \ by step. Let “D” denote “being a dancer”, “m” denote “Maria”, “g” denote “Georgia”,\
+ \ “W” denote “weakling”, “A” denote “playing volleyball”. Then, we have the following:\n\
+ 1. Dm → Maria is a dancer.\n2. (∀x)(Wx ⊃ ~Dx). → For all x, if x is a weakling, then\
+ \ x is not a dancer. In other words, no weakling is a dancer.\n3. (∀x)Wx ∨ Ag\t\
+ / (∃x)Ax → For all x, x is a weakling or Georgia plays volleyball. So there exists\
+ \ an x that plays volleyball. \nOptions (A) and (C) do claim that some weaklings\
+ \ are not dancers, but the second argument strongly states that no weakling is a\
+ \ dancer. Thus, we can eliminate them. Option (B) omits the important detail about\
+ \ Georgia playing volleyball. Option (D) has all the details presented in the arguments\
+ \ and is the best English interpretation of the arguments. The answer is (D).\n\n\
+ Q: Select the best translation into predicate logic: No people drive on Mars.\n\
+ (A) ~Pd (B) (∀x)(Px ∨ ~Dx) (C) (∀x)(Px ⊃ ~Dx) (D) ~Dp\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on formal logic for help. Let’s solve this step\
+ \ by step. Let “P” denote “being on Mars” and “D” denote “driving on Mars”. Then\
+ \ let’s consider each option:\nOption (A): ~Pd → d is not on Mars.\nOption (B):\
+ \ (∀x)(Px ∨ ~Dx) → For all x, x is on Mars and x do not drive on Mars.\nOption (C):\
+ \ (∀x)(Px ⊃ ~Dx) → For all x, x is on Mars implies that x do not drive on Mars.\n\
+ Option (D): ~Dp: → p do not drive on Mars.\nOf all these options, Option (C) appears\
+ \ to be the best and most meaningful interpretation of the argument “No people drive\
+ \ on Mars.” The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_formal_logic"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
index d5031b00..0287df11 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
@@ -1,33 +1,34 @@
-dataset_name: global_facts
-description: "The following are multiple choice questions (with answers) about global\
- \ facts.\n\nQ: As of 2017, how many of the world\u2019s 1-year-old children today\
- \ have been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%\n\
- A: Let's think step by step. We refer to Wikipedia articles on global facts for\
- \ help. According to data published by the World Health Organization, the nummber\
- \ of 1-year-old children vaccinated in 2017 exceeds 80%. 
The answer is (A).\n\n\
- Q: As of 2019, about what percentage of Americans agree that the state is run for\
- \ the benefit of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think\
- \ step by step. We refer to Wikipedia articles on global facts for help. In 2019,\
- \ about 46% percentage of Americans agree that the state is run for the benefit\
- \ of all the people. The answer is (B).\n\nQ: As of 2019, about what percentage\
- \ of Russians say it is very important to have free media in our country without\
- \ government/state censorship?\n(A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think\
- \ step by step. We refer to Wikipedia articles on global facts for help. As of 2019,\
- \ about 38% of Russians say it is very important to have free media in our country.\
- \ The answer is (A).\n\nQ: As of 2015, since 1990 forests have ____ in Europe and\
- \ have ____ in Africa and the Americas.\n(A) increased, increased (B) increased,\
- \ decreased (C) decreased, increased (D) decreased, decreased\nA: Let's think step\
- \ by step. We refer to Wikipedia articles on global facts for help. As of 2015,\
- \ since 1990 forests have increased in Europe and have decreased in Africa and the\
- \ Americas. The answer is (B).\n\nQ: Which of the following pairs of statements\
- \ are both true (as of 2019)?\n(A) People tend to be optimistic about their own\
- \ future and the future of their nation or the world. (B) People tend to be optimistic\
- \ about their own future but pessimistic\
- \ about the future of their nation or the world. (C) People tend to be pessimistic\
- \ about their own future but optimistic\
- \ about the future of their nation or the world. (D) People tend to be pessimistic\
- \ about their own future and the future of their nation or the world.\nA: Let's\
- \ think step by step. We refer to Wikipedia articles on global facts for help. As\
- \ of 2019, most people tend to be optimistic about their own future but pessimistic\
- \ about the future of their nation or the world. The answer is (B)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_global_facts
+"dataset_name": "global_facts"
+"description": "The following are multiple choice questions (with answers) about global\
+ \ facts.\n\nQ: As of 2017, how many of the world’s 1-year-old children today have\
+ \ been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on global facts for help. According\
+ \ to data published by the World Health Organization, the number of 1-year-old\
+ \ children vaccinated in 2017 exceeds 80%. The answer is (A).\n\nQ: As of 2019,\
+ \ about what percentage of Americans agree that the state is run for the benefit\
+ \ of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on global facts for help. In 2019, about 46%\
+ \ of Americans agree that the state is run for the benefit of all the people. The\
+ \ answer is (B).\n\nQ: As of 2019, about what percentage of Russians say it is very\
+ \ important to have free media in our country without government/state censorship?\n\
+ (A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on global facts for help. As of 2019, about 38% of Russians say it is\
+ \ very important to have free media in our country. The answer is (A).\n\nQ: As\
+ \ of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the\
+ \ Americas.\n(A) increased, increased (B) increased, decreased (C) decreased, increased\
+ \ (D) decreased, decreased\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on global facts for help. As of 2015, since 1990 forests have increased in Europe\
+ \ and have decreased in Africa and the Americas. The answer is (B).\n\nQ: Which\
+ \ of the following pairs of statements are both true (as of 2019)?\n(A) People tend\
+ \ to be optimistic about their own future and the future of their nation or the\
+ \ world. (B) People tend to be optimistic about their own future but pessimistic\
+ \ about the future of their nation or the world. 
(C) People tend to be pessimistic\ + \ about their own future but optimistic about the future of their nation or the\ + \ world. (D) People tend to be pessimistic about their own future and the future\ + \ of their nation or the world.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on global facts for help. As of 2019, most people tend to be optimistic\ \ about their own future but pessimistic about the future of their nation or the\ - \ world. (C) People tend to be pessimistic about their own future but optimistic\ - \ about the future of their nation or the world. (D) People tend to be pessimistic\ - \ about their own future and the future of their nation or the world.\nA: Let's\ - \ think step by step. We refer to Wikipedia articles on global facts for help. As\ - \ of 2019, most people tend to be optimistic about their own future but pessimistic\ - \ about the future of their nation or the world. The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_global_facts + \ world. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml index 91295fe8..6573d82c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml @@ -1,54 +1,54 @@ -dataset_name: high_school_biology -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high\ \ school biology.\n\nQ: In animal cells, which of the following represents the most\ \ likely pathway that a secretory protein takes as it is synthesized in a cell?\n\ - (A) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013\ - rough ER (B) Ribosome\u2013Golgi apparatus\u2013rough ER\u2013secretory vesicle\u2013\ - plasma membrane (C) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory\ - \ vesicle\u2013rough ER (D) Ribosome\u2013rough ER\u2013Golgi apparatus\u2013secretory\ - \ vesicle\u2013plasma membrane\nA: Let's think step by step. Protein synthesis starts\ + (A) Plasma membrane–Golgi apparatus–ribosome–secretory vesicle–rough ER (B) Ribosome–Golgi\ + \ apparatus–rough ER–secretory vesicle–plasma membrane (C) Plasma membrane–Golgi\ + \ apparatus–ribosome–secretory vesicle–rough ER (D) Ribosome–rough ER–Golgi apparatus–secretory\ + \ vesicle–plasma membrane\nA: Let's think step by step. Protein synthesis starts\ \ at the ribosome, so we can eliminate (A) and (C). The ribosome is often in the\ \ endoplasmic reticulum and moves from there to the Golgi apparatus, where it is\ \ modified and packaged into a vesicle. The vesicle then floats to the plasma membrane\ \ and is secreted. The answer is (D).\n\nQ: A mutation in a bacterial enzyme changed\ \ a previously polar amino acid into a nonpolar amino acid. This amino acid was\ - \ located at a site distant from the enzyme\u2019s active site. 
How might this mutation\ - \ alter the enzyme\u2019s substrate specificity?\n(A) By changing the enzyme\u2019\ - s pH optimum (B) By changing the enzyme\u2019s location in the cell (C) By changing\ - \ the shape of the protein (D) An amino acid change away from the active site cannot\ - \ alter the enzyme\u2019s substrate specificity.\nA: Let's think step by step. A\ - \ change in an amino acid leads to a change in the primary structure of the protein.\ - \ A change in the primary structure may lead to a change in the secondary and the\ - \ tertiary structure of the protein. A change in the tertiary structure means a\ - \ change in the shape of the protein, so (C) has to be correct. Since the change\ - \ does not affect the active site of the enzyme, we do not expect the activity of\ - \ the enzyme to be affected. The answer is (C).\n\nQ: Which of the following is\ - \ not a way to form recombinant DNA?\n(A) Translation (B) Conjugation (C) Specialized\ - \ transduction (D) Transformation\nA: Let's think step by step. The introduction\ - \ of foreign DNA or RNA into bacteria or eukaryotic cells is a common technique\ - \ in molecular biology and scientific research. There are multiple ways foreign\ - \ DNA can be introduced into cells including transformation, transduction, conjugation,\ - \ and transfection. In contrast, (A) is not a way to form DNA: during translation\ - \ the ribosomes synthesize proteins from RNA. The answer is (A).\n\nQ: Homologous\ - \ structures are often cited as evidence for the process of natural selection. All\ - \ of the following are examples of homologous structures EXCEPT\n(A) the wings of\ - \ a bird and the wings of a bat (B) the flippers of a whale and the arms of a man\ - \ (C) the pectoral fins of a porpoise and the flippers of a seal (D) the forelegs\ - \ of an insect and the forelimbs of a dog\nA: Let's think step by step. \u200B\u200B\ - Homologous structures are similar physical features in organisms that share a common\ - \ ancestor \u200B\u200Bbut different functions. Comparisons (B) and (C) are clearly\ - \ homologous because they share a common ancestor and the structures serve different\ - \ purposes. Bat wings and birg wings are also homologous, while they are both wings,\ - \ the forelimbs serve different purposes. Insects and dogs are very far ancestors\ - \ since one is vertebrate while the other is invertebrate and the forelimbs serve\ - \ the same purpose, so they are not homologous. The answer is (D).\n\nQ: Which of\ - \ the following is not known to be involved in the control of cell division?\n(A)\ - \ Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast cells\nA: Let's think\ - \ step by step. Normal cells move through the cell cycle in a regulated way. At\ - \ the checkpoint stage, they use information about their own internal state and\ - \ cues from the environment around them to decide whether to proceed with cell division.\ - \ Cues like these act by changing the activity of core cell cycle regulators inside\ - \ the cell. The most common regulators are cyclins and cyclin-dependent kinases.\ - \ Fibroblast cells do not play any role in cell division. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_biology + \ located at a site distant from the enzyme’s active site. 
How might this mutation\
+ \ alter the enzyme’s substrate specificity?\n(A) By changing the enzyme’s pH optimum\
+ \ (B) By changing the enzyme’s location in the cell (C) By changing the shape of\
+ \ the protein (D) An amino acid change away from the active site cannot alter the\
+ \ enzyme’s substrate specificity.\nA: Let's think step by step. A change in an amino\
+ \ acid leads to a change in the primary structure of the protein. A change in the\
+ \ primary structure may lead to a change in the secondary and the tertiary structure\
+ \ of the protein. A change in the tertiary structure means a change in the shape\
+ \ of the protein, so (C) has to be correct. Since the change does not affect the\
+ \ active site of the enzyme, we do not expect the activity of the enzyme to be affected.\
+ \ The answer is (C).\n\nQ: Which of the following is not a way to form recombinant\
+ \ DNA?\n(A) Translation (B) Conjugation (C) Specialized transduction (D) Transformation\n\
+ A: Let's think step by step. The introduction of foreign DNA or RNA into bacteria\
+ \ or eukaryotic cells is a common technique in molecular biology and scientific\
+ \ research. There are multiple ways foreign DNA can be introduced into cells including\
+ \ transformation, transduction, conjugation, and transfection. In contrast, (A)\
+ \ is not a way to form DNA: during translation the ribosomes synthesize proteins\
+ \ from RNA. The answer is (A).\n\nQ: Homologous structures are often cited as evidence\
+ \ for the process of natural selection. All of the following are examples of homologous\
+ \ structures EXCEPT\n(A) the wings of a bird and the wings of a bat (B) the flippers\
+ \ of a whale and the arms of a man (C) the pectoral fins of a porpoise and the flippers\
+ \ of a seal (D) the forelegs of an insect and the forelimbs of a dog\nA: Let's think\
+ \ step by step. Homologous structures are similar physical features in organisms\
+ \ that share a common ancestor but different functions. Comparisons (B) and (C)\
+ \ are clearly homologous because they share a common ancestor and the structures\
+ \ serve different purposes. Bat wings and bird wings are also homologous, while\
+ \ they are both wings, the forelimbs serve different purposes. Insects and dogs\
+ \ are very far ancestors since one is vertebrate while the other is invertebrate\
+ \ and the forelimbs serve the same purpose, so they are not homologous. The answer\
+ \ is (D).\n\nQ: Which of the following is not known to be involved in the control\
+ \ of cell division?\n(A) Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast\
+ \ cells\nA: Let's think step by step. Normal cells move through the cell cycle in\
+ \ a regulated way. At the checkpoint stage, they use information about their own\
+ \ internal state and cues from the environment around them to decide whether to\
+ \ proceed with cell division. Cues like these act by changing the activity of core\
+ \ cell cycle regulators inside the cell. The most common regulators are cyclins\
+ \ and cyclin-dependent kinases. Fibroblast cells do not play any role in cell division.\
+ \ The answer is (D)." 
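Aside: each per-subject file in this directory is only a thin overlay on _mmlu_flan_cot_fewshot_template_yaml, pulled in through its include key. A rough Python sketch of how such an include could be resolved with PyYAML; the resolution rule assumed here (included file lives in the same directory, and the including file's keys win on conflict) is illustrative, not necessarily the harness's actual loader:

    import os
    import yaml

    def load_task_config(path):
        # Load a task YAML; if it names an `include`, load that file from the
        # same directory and let the including file's keys take precedence.
        # Assumed merge semantics, for illustration only.
        with open(path) as f:
            config = yaml.full_load(f)
        included = config.pop("include", None)
        if included is not None:
            base = load_task_config(os.path.join(os.path.dirname(path), included))
            base.update(config)
            config = base
        return config

    # Hypothetical usage:
    # cfg = load_task_config("lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml")
    # print(cfg["task"], cfg["dataset_name"])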
+"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml index ce2a26cc..577a4866 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_chemistry -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high\ \ school chemistry.\n\nQ: Which of the following is considered an acid anhydride?\n\ (A) HCl (B) H2SO3 (C) SO2 (D) Al(NO3)3\nA: Let's think step by step. An acid anhydride\ \ is a compound that is derived by removing water from an acid. The chemical formula\ @@ -45,5 +45,6 @@ description: "The following are multiple choice questions (with answers) about h \ the acetate ion. The added strong acid, Nitric acid, will react with the conjugate\ \ base. Therefore the maximum amount of acid that can be added will be equal to\ \ the amount of acetate ion, or 2 moles. The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_chemistry +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml index 16a9f66d..6b0e0c8f 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_computer_science -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ \ school computer science.\n\nQ: Which of the following is an example of the use\ \ of a device on the Internet of Things (IoT) ?\n(A) A car alerts a driver that\ \ it is about to hit an object. (B) A hiker uses a G P S watch to keep track of\ @@ -26,9 +26,9 @@ description: "The following are multiple choice questions (with answers) about h \ launched from any web sites visited or files downloaded.\nA: Let's think step\ \ by step. Choice A is incorrect as it only describes network traffic, which an\ \ anonymous browser does not change. Choice B is correct as it correctly describes\ - \ how an anonymous browser will prevent saving data on the user\u2019s computer\ - \ after the session is ended. Choice C is incorrect because an anonymous browser\ - \ will not prevent logging in to email or social media accounts. Choice D is incorrect\ + \ how an anonymous browser will prevent saving data on the user’s computer after\ + \ the session is ended. Choice C is incorrect because an anonymous browser will\ + \ not prevent logging in to email or social media accounts. Choice D is incorrect\ \ because an anonymous browser in itself performs no virus protection. 
The answer\ \ is (B).\n\nQ: In the program below, the initial value of X is 5 and the initial\ \ value of Y is 10.\nIF (X < 0){\n DISPLAY (\"Foxtrot\")\n} ELSE {\n IF (X > Y){\n\ @@ -66,5 +66,6 @@ description: "The following are multiple choice questions (with answers) about h \ its value is greater than 100, regardless of the elements in the list. Choice\ \ D is incorrect because its step 3 does not increment the value of position, so\ \ it will repeat forever. The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_computer_science +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml index 0e7aafcc..ca8ec93f 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_european_history -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high\ \ school european history.\n\nQ: This question refers to the following information.\n\ Albeit the king's Majesty justly and rightfully is and ought to be the supreme head\ \ of the Church of England, and so is recognized by the clergy of this realm in\ @@ -34,7 +34,7 @@ description: "The following are multiple choice questions (with answers) about h \ the corruption in the Church of England. The answer is (D).\n\nQ: This question\ \ refers to the following information.\nRead the following excerpt.\nThe revolutionary\ \ seed had penetrated into every country and spread more or less. It was greatly\ - \ developed under the r\xE9gime of the military despotism of Bonaparte. His conquests\ + \ developed under the régime of the military despotism of Bonaparte. His conquests\ \ displaced a number of laws, institutions, and customs; broke through bonds sacred\ \ among all nations, strong enough to resist time itself; which is more than can\ \ be said of certain benefits conferred by these innovators.\nThe monarchs will\ @@ -55,9 +55,9 @@ description: "The following are multiple choice questions (with answers) about h Let them maintain religious principles in all their purity, and not allow the faith\ \ to be attacked and morality interpreted according to the social contract or the\ \ visions of foolish sectarians.\nLet them suppress Secret Societies; that gangrene\ - \ of society.\n\u2014Klemens von Metternich, Political Confession of Faith, 1820\n\ - Which of the following was the greatest cause of the fears expressed by Metternich\ - \ in the document above?\n(A) The ideas of personal liberty and nationalism conceived\ + \ of society.\n—Klemens von Metternich, Political Confession of Faith, 1820\nWhich\ + \ of the following was the greatest cause of the fears expressed by Metternich in\ + \ the document above?\n(A) The ideas of personal liberty and nationalism conceived\ \ during the Enlightenment resulted in radical revolutions that could spread throughout\ \ Europe. (B) The conquest of Europe by Napoleon led to the creation of new factions\ \ and shifted the European balance of power. 
(C) The power of monarchs had grown\ @@ -110,15 +110,15 @@ description: "The following are multiple choice questions (with answers) about h \ were all turning to the politicians; the famous Nihilists who made Europe tremble-sons\ \ of village priests, of the lower middle class, of tradesmen-could not rise above\ \ the idea of national liberation, and seemed to believe that the world would be\ - \ delivered-when they had killed their despot&\u2026\n\"Foolery! They'll never get\ - \ out of it with their foolery.\"\nThen, lowering his voice still more, in a few\ - \ bitter words he described his old dream of fraternity. He had renounced his rank\ - \ and his fortune; he had gone among workmen, only in the hope of seeing at last\ - \ the foundation of a new society of labour in common. All the sous in his pockets\ - \ had long gone to the urchins of the settlement; he had been as tender as a brother\ + \ delivered-when they had killed their despot&…\n\"Foolery! They'll never get out\ + \ of it with their foolery.\"\nThen, lowering his voice still more, in a few bitter\ + \ words he described his old dream of fraternity. He had renounced his rank and\ + \ his fortune; he had gone among workmen, only in the hope of seeing at last the\ + \ foundation of a new society of labour in common. All the sous in his pockets had\ + \ long gone to the urchins of the settlement; he had been as tender as a brother\ \ with the colliers, smiling at their suspicion, winning them over by his quiet\ \ workmanlike ways and his dislike of chattering. But decidedly the fusion had not\ - \ taken place.\nHis voice changed, his eyes grew bright, he fixed them on \xE9tienne,\ + \ taken place.\nHis voice changed, his eyes grew bright, he fixed them on étienne,\ \ directly addressing him:\n\"Now, do you understand that? These hatworkers at Marseilles\ \ who have won the great lottery prize of a hundred thousand francs have gone off\ \ at once and invested it, declaring that they are going to live without doing anything!\ @@ -127,7 +127,7 @@ description: "The following are multiple choice questions (with answers) about h \ out as much as you like against the rich, you haven't got courage enough to give\ \ back to the poor the money that luck brings you. You will never be worthy of happiness\ \ as long as you own anything, and your hatred of the bourgeois proceeds solely\ - \ from an angry desire to be bourgeois yourselves in their place.\"\n\xE9mile Zola,\ + \ from an angry desire to be bourgeois yourselves in their place.\"\némile Zola,\ \ French writer, Germinal, 1885\nThe passage displays the direct concern for the\ \ welfare of the working classes that was typically a part of which movement?\n\ (A) Capitalist (B) Scientific (C) Communist (D) Existentialist\nA: Let's think step\ @@ -156,13 +156,14 @@ description: "The following are multiple choice questions (with answers) about h \ whether Jewish, Christian or Turkish, appear to me no other than human inventions,\ \ set up to terrify and enslave mankind, and monopolize power and profit.\nI do\ \ not mean by this declaration to condemn those who believe otherwise; they have\ - \ the same right to their belief as I have to mine.\n\u2014Thomas Paine, The Age\ - \ of Reason, 1794\u20131795\nWhich of the following Enlightenment philosophes designed\ - \ a system of checks and balances for government to avoid abuses of power?\n(A)\ - \ Jean Jacques Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft (D) Adam Smith\n\ - A: Let's think step by step. 
We refer to Wikipedia articles on european history\
 \ for help. Baron Montesquieu was an 18th century French philosopher who wrote extensively\
 \ against the monopolization of power and advocated for a system of checks and balances\
 \ in government to prevent the rise of despotism. The answer is (B)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_high_school_european_history
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_high_school_european_history"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml
index 42f6c040..87c27868 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml
@@ -1,63 +1,37 @@
-dataset_name: high_school_geography
-description: 'The following are multiple choice questions (with answers) about high
- school geography.
-
-
- Q: Which one of the following items is an example of nonmaterial culture?
-
- (A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).
-
- A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
- Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not
- physical objects. The answer is (C).
-
-
- Q: During the third stage of the demographic transition model, which of the following
- is true?
-
- (A) Birth rates increase and population growth rate is less rapid. (B) Birth rates
- decline and population growth rate is less rapid. (C) Birth rates increase and population
- growth rate increases. (D) Birth rates decrease and population growth rate increases.
-
- A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
- The demographic transition model models the five different stages of population
- growth as a country goes through economic development, where the third stage refers
- to a period of declining birth rates and lower population growth. The answer is
- (B).
-
-
- Q: The practice of hiring a foreign third-party service provider to run an operation
- is called
-
- (A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.
-
- A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
- "Offshoring" literally means to move or base some of the activities or processes
- of a company to a foreign country. The answer is (B).
-
-
- Q: Which of the following statements is NOT accurate regarding the services provided
- by local governments in the United States?
-
- (A) Duplication of efforts occurs often. (B) Social problems of the central city
- spill over into the surrounding residential suburbs. (C) Inefficiency in providing
- services occurs often. (D) One neighborhood''s efforts to reduce pollution are always
- supported by neighboring communities.
-
- A: Let''s think step by step. 
We refer to Wikipedia articles on geography for help. - There may be economic, social or political reasons for two neighboring communities - and their local governments not agreeing to pollution reduction efforts initiated - by one of them. The answer is (D). - - - Q: The rate of natural increase of a population is found by subtracting the - - (A) crude death rate from the crude birth date. (B) crude birth rate from the crude - death rate. (C) doubling time from the crude birth rate. (D) fertility rate from - the crude death rate. - - A: Let''s think step by step. We refer to Wikipedia articles on geography for help. - The difference between number of births and deaths gives the population increase - at any given time. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_geography +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\nQ: Which one of the following items is an example of nonmaterial\ + \ culture?\n(A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).\n\ + A: Let's think step by step. We refer to Wikipedia articles on geography for help.\ + \ Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not\ + \ physical objects. The answer is (C).\n\nQ: During the third stage of the demographic\ + \ transition model, which of the following is true?\n(A) Birth rates increase and\ + \ population growth rate is less rapid. (B) Birth rates decline and population growth\ + \ rate is less rapid. (C) Birth rates increase and population growth rate increases.\ + \ (D) Birth rates decrease and population growth rate increases.\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on geography for help. The demographic\ + \ transition model models the five different stages of population growth as a country\ + \ goes through economic development, where the third stage refers to a period of\ + \ declining birth rates and lower population growth. The answer is (B).\n\nQ: The\ + \ practice of hiring a foreign third-party service provider to run an operation\ + \ is called\n(A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.\n\ + A: Let's think step by step. We refer to Wikipedia articles on geography for help.\ + \ \"Offshoring\" literally means to move or base some of the activities or processes\ + \ of a company to a foreign country. The answer is (B).\n\nQ: Which of the following\ + \ statements is NOT accurate regarding the services provided by local governments\ + \ in the United States?\n(A) Duplication of efforts occurs often. (B) Social problems\ + \ of the central city spill over into the surrounding residential suburbs. (C) Inefficiency\ + \ in providing services occurs often. (D) One neighborhood's efforts to reduce pollution\ + \ are always supported by neighboring communities.\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on geography for help. There may be economic, social\ + \ or political reasons for two neighboring communities and their local governments\ + \ not agreeing to pollution reduction efforts initiated by one of them. The answer\ + \ is (D).\n\nQ: The rate of natural increase of a population is found by subtracting\ + \ the\n(A) crude death rate from the crude birth date. (B) crude birth rate from\ + \ the crude death rate. (C) doubling time from the crude birth rate. 
(D) fertility\ + \ rate from the crude death rate.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on geography for help. The difference between number of births and deaths\ + \ gives the population increase at any given time. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml index 8ec1c5b0..a0a67146 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml @@ -1,67 +1,43 @@ -dataset_name: high_school_government_and_politics -description: 'The following are multiple choice questions (with answers) about high - school government and politics. - - - Q: Which of the following best states an argument made by James Madison in The Federalist - number 10? - - (A) Honest politicians can prevent factions from developing. (B) Factions are more - likely to occur in large republics than in small ones. (C) The negative effects - of factionalism can be reduced by a republican government. (D) Free elections are - the people''s best defense against factionalism. - - A: Let''s think step by step. We refer to Wikipedia articles on government and politics - for help. In the Federalist number 10, James Madison advocated for a representative - republican form of government to guard against factionalism. The answer is (C). - - - Q: The term "budget deficit" refers to the - - (A) annual increase in federal spending on the military (B) amount of interest on - the national debt (C) difference between the initial budget proposals made by the - president and Congress (D) amount the government spends in excess of its revenues - - A: Let''s think step by step. We refer to Wikipedia articles on government and politics - for help. When the goverment spends more than it earns, their difference is the - budget deficit. The answer is (D). - - - Q: Which of the following statements about cabinet departments is FALSE? - - (A) They are established by the legislative branch. (B) Their members often don''t - have much influence over presidential decisions. (C) They cannot all be run by leaders - who belong to the same political party the president does. (D) Not every federal - agency is a cabinet department. - - A: Let''s think step by step. We refer to Wikipedia articles on government and politics - for help. There is no law stipulating that some cabinet department leaders have - to belong to a political party different from that of the president. The answer - is (C). - - - Q: Which of the following cases established the precedent that a defendant must - be informed of the right to remain silent, the right to a lawyer, and protection - from self-incrimination? - - (A) Weeks v. United States (B) Betts v. Brady (C) Mapp v. Ohio (D) Miranda v. Arizona - - A: Let''s think step by step. We refer to Wikipedia articles on government and politics - for help. In the landmark Miranda v. Arizona in 1966, the US Supreme Court, based - on the Fifth and Sixth Amendment of the US Constitution, guaranteed a defendant''s - right to an attorney and protection from self-incrimination. The answer is (D). 
- - - Q: Uncertainty over the limits to presidential power is caused primarily by the - fact that - - (A) the constitutional definition of those powers is broad and unspecific (B) most - people agree that the Constitution places too many limits on presidential power - (C) the Supreme Court consistently refuses to rule on cases concerning presidential - powers (D) constitutional amendments have greatly increased presidential powers - - A: Let''s think step by step. We refer to Wikipedia articles on government and politics - for help. The US Constitution is not very specific about the powers of the president, - leading to uncertainty over its limits. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_government_and_politics +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\nQ: Which of the following best states an argument\ + \ made by James Madison in The Federalist number 10?\n(A) Honest politicians can\ + \ prevent factions from developing. (B) Factions are more likely to occur in large\ + \ republics than in small ones. (C) The negative effects of factionalism can be\ + \ reduced by a republican government. (D) Free elections are the people's best defense\ + \ against factionalism.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on government and politics for help. In the Federalist number 10, James Madison\ + \ advocated for a representative republican form of government to guard against\ + \ factionalism. The answer is (C).\n\nQ: The term \"budget deficit\" refers to the\n\ + (A) annual increase in federal spending on the military (B) amount of interest on\ + \ the national debt (C) difference between the initial budget proposals made by\ + \ the president and Congress (D) amount the government spends in excess of its revenues\n\ + A: Let's think step by step. We refer to Wikipedia articles on government and politics\ + \ for help. When the goverment spends more than it earns, their difference is the\ + \ budget deficit. The answer is (D).\n\nQ: Which of the following statements about\ + \ cabinet departments is FALSE?\n(A) They are established by the legislative branch.\ + \ (B) Their members often don't have much influence over presidential decisions.\ + \ (C) They cannot all be run by leaders who belong to the same political party the\ + \ president does. (D) Not every federal agency is a cabinet department.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on government and politics\ + \ for help. There is no law stipulating that some cabinet department leaders have\ + \ to belong to a political party different from that of the president. The answer\ + \ is (C).\n\nQ: Which of the following cases established the precedent that a defendant\ + \ must be informed of the right to remain silent, the right to a lawyer, and protection\ + \ from self-incrimination?\n(A) Weeks v. United States (B) Betts v. Brady (C) Mapp\ + \ v. Ohio (D) Miranda v. Arizona\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on government and politics for help. In the landmark Miranda v. Arizona\ + \ in 1966, the US Supreme Court, based on the Fifth and Sixth Amendment of the US\ + \ Constitution, guaranteed a defendant's right to an attorney and protection from\ + \ self-incrimination. 
The answer is (D).\n\nQ: Uncertainty over the limits to presidential\ + \ power is caused primarily by the fact that\n(A) the constitutional definition\ + \ of those powers is broad and unspecific (B) most people agree that the Constitution\ + \ places too many limits on presidential power (C) the Supreme Court consistently\ + \ refuses to rule on cases concerning presidential powers (D) constitutional amendments\ + \ have greatly increased presidential powers\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on government and politics for help. The US Constitution\ + \ is not very specific about the powers of the president, leading to uncertainty\ + \ over its limits. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml index f47a83e6..c82b0739 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml @@ -1,64 +1,37 @@ -dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high - school macroeconomics. - - - Q: Which of the following policies best describes supply-side fiscal policy? - - (A) An increase in the money supply (B) Increased government spending (C) Lower - taxes on research and development of new technology (D) Higher taxes on household - income - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. Supply-side fiscal policy stimulates the economy by encouraging more production - of goods and services through reduction in taxes and deregulation. The answer is - (C). - - - Q: The short-run Phillips curve indicates a - - (A) direct relation between unemployment and inflation (B) direct relation between - price and quantity demanded (C) inverse relation between price and quantity demanded - (D) inverse relation between unemployment and inflation - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. The short-run Phillips curve shows that whenever unemployment decreases below - a natural level, the inflation starts increasing, and vice-versa. The answer is - (D). - - - Q: Holding all else equal which of the following monetary policies would be used - to boost U.S. exports? - - (A) Increasing the discount rate (B) Increasing the reserve ratio (C) Buying government - securities (D) Lowering tariffs - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. Buying government securities leads to reduction in demand for US dollars from - foreign buyers, thereby making it cheaper and hence making US exports more attractive. - The answer is (C). - - - Q: A federal deficit occurs when - - (A) exports exceed imports. (B) imports exceed exports. (C) federal tax collections - exceed spending. (D) federal spending exceeds federal tax revenues. - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. A federal deficit occurs when federal spending exceeds federal income which - is primarily from tax revenues. The answer is (D). - - - Q: Which of the following is not included in the U.S. GDP? - - (A) The U.S. 
military opens a new base in a foreign country with 1000 U.S. personnel. - (B) Japanese consumers buy thousands of CDs produced in the United States. (C) An - American pop singer performs a sold-out concert in Paris. (D) A French theatrical - production tours dozens of American cities. - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. The economic transactions related to the performance of the American pop-singer - in Paris happens entirely outside the U.S. and hence is not included in the GDP - numbers. The answer is (C).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_macroeconomics +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\nQ: Which of the following policies best describes supply-side\ + \ fiscal policy?\n(A) An increase in the money supply (B) Increased government spending\ + \ (C) Lower taxes on research and development of new technology (D) Higher taxes\ + \ on household income\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on macroeconomics for help. Supply-side fiscal policy stimulates the economy by\ + \ encouraging more production of goods and services through reduction in taxes and\ + \ deregulation. The answer is (C).\n\nQ: The short-run Phillips curve indicates\ + \ a\n(A) direct relation between unemployment and inflation (B) direct relation\ + \ between price and quantity demanded (C) inverse relation between price and quantity\ + \ demanded (D) inverse relation between unemployment and inflation\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on macroeconomics for help. The short-run\ + \ Phillips curve shows that whenever unemployment decreases below a natural level,\ + \ the inflation starts increasing, and vice-versa. The answer is (D).\n\nQ: Holding\ + \ all else equal which of the following monetary policies would be used to boost\ + \ U.S. exports?\n(A) Increasing the discount rate (B) Increasing the reserve ratio\ + \ (C) Buying government securities (D) Lowering tariffs\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on macroeconomics for help. Buying government\ + \ securities leads to reduction in demand for US dollars from foreign buyers, thereby\ + \ making it cheaper and hence making US exports more attractive. The answer is (C).\n\ + \nQ: A federal deficit occurs when\n(A) exports exceed imports. (B) imports exceed\ + \ exports. (C) federal tax collections exceed spending. (D) federal spending exceeds\ + \ federal tax revenues.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on macroeconomics for help. A federal deficit occurs when federal spending exceeds\ + \ federal income which is primarily from tax revenues. The answer is (D).\n\nQ:\ + \ Which of the following is not included in the U.S. GDP?\n(A) The U.S. military\ + \ opens a new base in a foreign country with 1000 U.S. personnel. (B) Japanese consumers\ + \ buy thousands of CDs produced in the United States. (C) An American pop singer\ + \ performs a sold-out concert in Paris. (D) A French theatrical production tours\ + \ dozens of American cities.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on macroeconomics for help. The economic transactions related to the\ + \ performance of the American pop-singer in Paris happens entirely outside the U.S.\ + \ and hence is not included in the GDP numbers. 
The answer is (C)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml index eb692a09..a73a8290 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_mathematics -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high\ \ school mathematics.\n\nQ: Simplify and write the result with a rational denominator:\ \ $$\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}$$\n(A) \\frac{3\\sqrt{3}}{3} (B) \\\ frac{1}{3} (C) \\sqrt{3} (D) \\frac{\\sqrt{3}}{3}\nA: Let's think step by step.\ @@ -13,7 +13,7 @@ description: "The following are multiple choice questions (with answers) about h \ of $9600/300=32=2^5$. Since at this interest rate it takes six years for it to\ \ double, it will take $5*6=30$ years to grow to $\\$9600$. The answer is (C).\n\ \nQ: Ten students take a biology test and receive the following scores: 45, 55,\ - \ 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students\u2019 test scores?\n\ + \ 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students’ test scores?\n\ (A) 55 (B) 60 (C) 62 (D) 65\nA: Let's think step by step. There are 10 students\ \ and the sum of their scores is $45 + 55 + 50 + 70 + 65 + 80 + 40 + 90 + 70 + 85\ \ = 650$, the mean is $650/10=65$. The answer is (D).\n\nQ: The variable $x$ varies\ @@ -32,5 +32,6 @@ description: "The following are multiple choice questions (with answers) about h \ dance.)\n(A) 3 (B) 15 (C) 6 (D) 5\nA: Let's think step by step. The least common\ \ multiple of 2, 3 and 5 is 30, so during a 7 minute dance, all the three lights\ \ will come on at the same time $2*7+1=15$ times. The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_mathematics +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml index 86c83c82..8c000e6d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml @@ -1,63 +1,40 @@ -dataset_name: high_school_microeconomics -description: 'The following are multiple choice questions (with answers) about high - school microeconomics. - - - Q: Which of the following is necessarily a characteristic of oligopoly? - - (A) Free entry into and exit from the market (B) A few large producers (C) One producer - of a good with no close substitutes (D) A homogenous product - - A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for - help. An oligopoly is when a market is dominated by just one or a few number of - sellers or producers. To get oligopoly, the market should have high barriers to - new entry, and the product has differentiation. The answer is (B). 
- - - Q: If the government subsidizes producers in a perfectly competitive market, then - - (A) the demand for the product will increase (B) the demand for the product will - decrease (C) the consumer surplus will increase (D) the consumer surplus will decrease - - A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for - help. (A) and (B) are wrong because the demand curve does not change at all. If - the government subsidizes producers, the supply will increase, and thus the consumer - surplus also increases. The answer is (C). - - - Q: Which of the following is true of a price floor? - - (A) The price floor shifts the demand curve to the left. (B) An effective floor - creates a shortage of the good. (C) The price floor shifts the supply curve of the - good to the right. (D) To be an effective floor, it must be set above the equilibrium - price. - - A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for - help. Price floor does not shift the demand or shift curve. An effective price floor - should be set above the equilibrium price, otherwise the market bears and the floor - does not have effective effect. The answer is (D). - - - Q: The concentration ratio for a monopoly is - - (A) 0 (B) 5 (C) 10 (D) 100 - - A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for - help. The concentration ratio is calculated as the sum of market share of a specific - number of largest companies. Monopoly means one company or entity controls the entire - market, therefore, the concentration ratio is 100 percent. The answer is (D). - - - Q: In a competitive labor market for housepainters, which of the following would - increase the demand for housepainters? - - (A) An effective minimum wage imposed on this labor market. (B) An increase in the - price of gallons of paint. (C) An increase in the construction of new houses. (D) - An increase in the price of mechanical painters so long as the output effect exceeds - the substitution effect. - - A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for - help. An increase in the construction of new houses means an increase demand of - in-house painting, thus increases the demand for housepainters. The answer is (C).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_microeconomics +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\nQ: Which of the following is necessarily a characteristic\ + \ of oligopoly?\n(A) Free entry into and exit from the market (B) A few large producers\ + \ (C) One producer of a good with no close substitutes (D) A homogenous product\n\ + A: Let's think step by step. We refer to Wikipedia articles on microeconomics for\ + \ help. An oligopoly is when a market is dominated by just one or a few number of\ + \ sellers or producers. To get oligopoly, the market should have high barriers to\ + \ new entry, and the product has differentiation. The answer is (B).\n\nQ: If the\ + \ government subsidizes producers in a perfectly competitive market, then\n(A) the\ + \ demand for the product will increase (B) the demand for the product will decrease\ + \ (C) the consumer surplus will increase (D) the consumer surplus will decrease\n\ + A: Let's think step by step. We refer to Wikipedia articles on microeconomics for\ + \ help. 
(A) and (B) are wrong because the demand curve does not change at all. If\ + \ the government subsidizes producers, the supply will increase, and thus the consumer\ + \ surplus also increases. The answer is (C).\n\nQ: Which of the following is true\ + \ of a price floor?\n(A) The price floor shifts the demand curve to the left. (B)\ + \ An effective floor creates a shortage of the good. (C) The price floor shifts\ + \ the supply curve of the good to the right. (D) To be an effective floor, it must\ + \ be set above the equilibrium price.\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on microeconomics for help. Price floor does not shift the\ + \ demand or shift curve. An effective price floor should be set above the equilibrium\ + \ price, otherwise the market bears and the floor does not have effective effect.\ + \ The answer is (D).\n\nQ: The concentration ratio for a monopoly is\n(A) 0 (B)\ + \ 5 (C) 10 (D) 100\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on microeconomics for help. The concentration ratio is calculated as the sum of\ + \ market share of a specific number of largest companies. Monopoly means one company\ + \ or entity controls the entire market, therefore, the concentration ratio is 100\ + \ percent. The answer is (D).\n\nQ: In a competitive labor market for housepainters,\ + \ which of the following would increase the demand for housepainters?\n(A) An effective\ + \ minimum wage imposed on this labor market. (B) An increase in the price of gallons\ + \ of paint. (C) An increase in the construction of new houses. (D) An increase in\ + \ the price of mechanical painters so long as the output effect exceeds the substitution\ + \ effect.\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics\ + \ for help. An increase in the construction of new houses means an increase demand\ + \ of in-house painting, thus increases the demand for housepainters. The answer\ + \ is (C)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml index f21a183c..92963bd6 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml @@ -1,38 +1,39 @@ -dataset_name: high_school_physics -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions (with answers) about high\ \ school physics.\n\nQ: A microwave oven is connected to an outlet, 120 V, and draws\ \ a current of 2 amps. At what rate is energy being used by the microwave oven?\n\ (A) 10 W (B) 30 W (C) 60 W (D) 240 W\nA: Let's think step by step. Rate of energy\ \ usage is known as power; in an dissipative electrical circuit, power is given\ \ by voltage times current. So in our case, the power is 120 V times 2 amps, or\ \ 240 W. The answer is (D).\n\nQ: A point charge, Q = +1 mC, is fixed at the origin.\ - \ How much work is required to move a charge, Q = +8 \xB5C, from the point (0, 4\ - \ meters) to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\n\ - A: Let's think step by step. 
To calculate the work required to move a charge from\ - \ one location to another in a fixed electric field, it is enough to calculate the\ - \ potential difference between the two locations. Here, the potential only depends\ - \ on the distance between the charges; it\u2019s $k q_1 q_2 / r$, where $k$ is Coulomb\u2019\ - s constant. Plugging in values $q_1 = $ 1 mC, $q_2 = 8 \\mu$ C, gives the answer\ - \ as 5.992 J, which rounds to 6 J. The answer is (B).\n\nQ: Which of the following\ - \ conditions will ensure that angular momentum is conserved? I. Conservation of\ - \ linear momentum II. Zero net external force III. Zero net external torque\n(A)\ - \ I and II only (B) I and III only (C) II and III only (D) III only\nA: Let's think\ - \ step by step. Torque is defined as the change in angular momentum; if there is\ - \ zero external torque, angular momentum is conserved. The answer is (D).\n\nQ:\ - \ A photocell of work function \u03D5 = 2eV is connected to a resistor in series.\ - \ Light of frequency f = 1 \xD7 10^15 Hz hits a metal plate of the photocell. If\ - \ the power of the light is P = 100 W, what is the current through the resistor?\n\ - (A) 2:00 AM (B) 6:00 AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The\ - \ only answer above which has units of current is D, 24 A. The answer is (D).\n\n\ - Q: A pipe full of air is closed at one end. A standing wave is produced in the pipe,\ - \ causing the pipe to sound a note. Which of the following is a correct statement\ - \ about the wave\u2019s properties at the closed end of the pipe?\n(A) The pressure\ - \ is at a node, but the particle displacement is at an antinode. (B) The pressure\ - \ is at an antinode, but the particle displacement is at a node. (C) The pressure\ - \ and the particle displacement are both at nodes. (D) The pressure and the particle\ - \ displacement are both at antinodes.\nA: Let's think step by step. At the closed\ - \ end of the pipe, the particles cannot have any net displacement because the pipe\ - \ closure stops them. So the particle displacement is at a node. This closure also\ - \ causes the pressure to be maximal, i.e. an antinode. The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_physics + \ How much work is required to move a charge, Q = +8 µC, from the point (0, 4 meters)\ + \ to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\nA: Let's\ + \ think step by step. To calculate the work required to move a charge from one location\ + \ to another in a fixed electric field, it is enough to calculate the potential\ + \ difference between the two locations. Here, the potential only depends on the\ + \ distance between the charges; it’s $k q_1 q_2 / r$, where $k$ is Coulomb’s constant.\ + \ Plugging in values $q_1 = $ 1 mC, $q_2 = 8 \\mu$ C, gives the answer as 5.992\ + \ J, which rounds to 6 J. The answer is (B).\n\nQ: Which of the following conditions\ + \ will ensure that angular momentum is conserved? I. Conservation of linear momentum\ + \ II. Zero net external force III. Zero net external torque\n(A) I and II only (B)\ + \ I and III only (C) II and III only (D) III only\nA: Let's think step by step.\ + \ Torque is defined as the change in angular momentum; if there is zero external\ + \ torque, angular momentum is conserved. The answer is (D).\n\nQ: A photocell of\ + \ work function ϕ = 2eV is connected to a resistor in series. Light of frequency\ + \ f = 1 × 10^15 Hz hits a metal plate of the photocell. 
If the power of the light\ + \ is P = 100 W, what is the current through the resistor?\n(A) 2:00 AM (B) 6:00\ + \ AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The only answer above which\ + \ has units of current is D, 24 A. The answer is (D).\n\nQ: A pipe full of air is\ + \ closed at one end. A standing wave is produced in the pipe, causing the pipe to\ + \ sound a note. Which of the following is a correct statement about the wave’s properties\ + \ at the closed end of the pipe?\n(A) The pressure is at a node, but the particle\ + \ displacement is at an antinode. (B) The pressure is at an antinode, but the particle\ + \ displacement is at a node. (C) The pressure and the particle displacement are\ + \ both at nodes. (D) The pressure and the particle displacement are both at antinodes.\n\ + A: Let's think step by step. At the closed end of the pipe, the particles cannot\ + \ have any net displacement because the pipe closure stops them. So the particle\ + \ displacement is at a node. This closure also causes the pressure to be maximal,\ + \ i.e. an antinode. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml index 706db0ec..b54a6c38 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml @@ -1,72 +1,48 @@ -dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high - school psychology. - - - Q: Pascale is interested in the processing strategies children use to learn new - information. Pascale would best be classified as what type of psychologist? - - (A) sociocultural (B) clinical (C) cognitive (D) behaviorist - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - Sociocultural psychologist focuses on the effect of societal factors on people. - Clinical psychologist focuses on people with mental issues. Cognitive psychologist - focuses on how people think and learn, including the processing strategies. Behaviorist - focuses more on the environment and experience effect on people. The answer is (C). - - - Q: According to Caplan''s model of consultee-centered case consultation, the consultant - is primarily interested in - - (A) identifying the causes and solutions of the client''s presenting problems (B) - identifying and eliminating the causes of the consultee''s difficulties in handling - a problem (C) establishing a hierarchy of authority to enable effective decision - making (D) presenting a single, well-defined and unambiguous course of action for - the consultant to overcome skills deficits - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - Caplan defines two type of consultation. Client-centered case consultation aims - to handle client''s problems, while consultee-centered case consultation aims to - identify the reason of client''s difficulty to solve problems. The answer is (B). - - - Q: According to the Individuals with Disabilities Education Improvement Act, which - of the following must an educational agency do before it changes the educational - placement of a student with a disability? 
- - (A) Give the child a trial period in the new environment (B) Notify the parents - in writing (C) Obtain school board approval (D) Obtain parental consent - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - When the decision to change the educational placement of a student with a disability - is made, the educational agency must notify the parents in writing on that date. - The answer is (B). - - - Q: While swimming in the ocean, Ivan is frightened by a dark shadow in the water - even before he has the chance to identify what the shadow is. The synaptic connections - taking place during this incident of fright are best described by which of the following? - - (A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are - sent from the thalamus to the "what" and "where" pathways. (C) Messages are sent - from the parasympathetic nervous system to the cerebral cortex. (D) Messages are - sent from the frontal lobes to the pituitary gland. - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - Our neural system has a mechanism that can respond immediate emotional signal before - going to the thought center. In the Ivan''s case, messages travel directly from - thalamus to amygdala. The answer is (A). - - - Q: Ani believes that her attitudes and behavior play a central role in what happens - to her. Such a belief is likely to be associated with - - (A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an internal - locus of control. - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - People with an external locus of control believes fate and luck play an important - role in their lives, while people with an internal locus of control believes they - control their lives. The answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_psychology +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\nQ: Pascale is interested in the processing strategies children\ + \ use to learn new information. Pascale would best be classified as what type of\ + \ psychologist?\n(A) sociocultural (B) clinical (C) cognitive (D) behaviorist\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ Sociocultural psychologist focuses on the effect of societal factors on people.\ + \ Clinical psychologist focuses on people with mental issues. Cognitive psychologist\ + \ focuses on how people think and learn, including the processing strategies. Behaviorist\ + \ focuses more on the environment and experience effect on people. The answer is\ + \ (C).\n\nQ: According to Caplan's model of consultee-centered case consultation,\ + \ the consultant is primarily interested in\n(A) identifying the causes and solutions\ + \ of the client's presenting problems (B) identifying and eliminating the causes\ + \ of the consultee's difficulties in handling a problem (C) establishing a hierarchy\ + \ of authority to enable effective decision making (D) presenting a single, well-defined\ + \ and unambiguous course of action for the consultant to overcome skills deficits\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ Caplan defines two type of consultation. 
Client-centered case consultation aims\ + \ to handle client's problems, while consultee-centered case consultation aims to\ + \ identify the reason of client's difficulty to solve problems. The answer is (B).\n\ + \nQ: According to the Individuals with Disabilities Education Improvement Act, which\ + \ of the following must an educational agency do before it changes the educational\ + \ placement of a student with a disability?\n(A) Give the child a trial period in\ + \ the new environment (B) Notify the parents in writing (C) Obtain school board\ + \ approval (D) Obtain parental consent\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on psychology for help. When the decision to change the educational\ + \ placement of a student with a disability is made, the educational agency must\ + \ notify the parents in writing on that date. The answer is (B).\n\nQ: While swimming\ + \ in the ocean, Ivan is frightened by a dark shadow in the water even before he\ + \ has the chance to identify what the shadow is. The synaptic connections taking\ + \ place during this incident of fright are best described by which of the following?\n\ + (A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are\ + \ sent from the thalamus to the \"what\" and \"where\" pathways. (C) Messages are\ + \ sent from the parasympathetic nervous system to the cerebral cortex. (D) Messages\ + \ are sent from the frontal lobes to the pituitary gland.\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on psychology for help. Our neural system\ + \ has a mechanism that can respond immediate emotional signal before going to the\ + \ thought center. In the Ivan's case, messages travel directly from thalamus to\ + \ amygdala. The answer is (A).\n\nQ: Ani believes that her attitudes and behavior\ + \ play a central role in what happens to her. Such a belief is likely to be associated\ + \ with\n(A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an\ + \ internal locus of control.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on psychology for help. People with an external locus of control believes\ + \ fate and luck play an important role in their lives, while people with an internal\ + \ locus of control believes they control their lives. The answer is (D)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml index 37e21061..918f6ac3 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml @@ -1,88 +1,58 @@ -dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high - school statistics. - - - Q: A new smartwatch is manufactured in one part of a factory, then secured for shipping - in another, independent part of the factory. The weight of the smartwatch has a - mean of 62 grams and a standard deviation of 1.0 grams. The weight of the packaging - (box, user''s guide, bubble wrap, etc.) has a mean of 456 grams and a standard deviation - of 6 grams. 
Together, the distribution of the weight of the smartwatch and its packaging - would have the following mean and standard deviation: - - (A) Mean 518 grams; standard deviation 7.0 grams (B) Mean 518 grams; standard deviation - 3.5 grams (C) Mean 518 grams; standard deviation 6.1 grams (D) Mean 394 grams; standard - deviation 6.1 grams - - A: Let''s think step by step. Since the weight of the watch and the weight of the - packaging are independent random variables, the mean and variance of their sum is - equal to the sum of their individual means and variances. So the mean is 62 + 456 - = 518 grams, and the variances is 1.0^2 + 6.0^2 = 37, leading to a standard deviation - of 6.1 grams. The answer is (C). - - - Q: After a frost warning was issued, the owner of a large orange grove asked his - workers to spray all his trees with water. The water was supposed to freeze and - form a protective covering of ice around the orange blossom. Nevertheless, the owner - suspected that some trees suffered considerable damage due to the frost. To estimate - the proportion of trees that suffered more than 50 percent damage due to the frost, - he took a random sample of 100 trees from his grove. What is the response variable - in this experiment? - - (A) The proportion of trees that suffered more than 50 percent damage due to frost. - (B) The number of trees affected by the frost. (C) The number of trees sampled from - the grove. (D) For each sampled tree, whether it suffered more than 50 percent damage - or at most 50 percent damage. - - A: Let''s think step by step. In this experiment, the response variable is what - is measured. For each tree, what is measured is whether or not it suffered more - than 50 percent damage due to the frost. The answer is (D). - - - Q: Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and - var(Y) = 12. What are the expected value and variance of the random variable X + - Y? - - (A) E(X + Y) = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X + - Y) = 99, var(X + Y) = 17 (D) There is insufficient information to answer this question. - - A: Let''s think step by step. While means of sums of random variables add (regardless - of whether the variables are independent) in order to determine the variance of - a sum of random variables, we need to know not just their individual variances but - the covariance of the two variables, which is not given in this problem. The answer - is (D). - - - Q: Which of the following sets has the smallest standard deviation? Which has the - largest? - - I: {1,2,3} - - II: {-10,10} - - III: {100} - - (A) I, II (B) II, III (C) III, I (D) III, II - - A: Let''s think step by step. The variance of distribution I is the expected squared - deviation from its mean (which is 2), so the variance is 2/3 . The variance of distribution - II is 10^2 (because both elements are 10 away from the mean of zero). The variance - of distribution III is 0, since it has a single entry. So distribution III has the - smallest standard deviation and distribution II has the largest. The answer is (D). - - - Q: Which of the following is a correct statement about correlation? - - (A) If the slope of the regression line is exactly 1, then the correlation is exactly - 1. (B) If the correlation is 0, then the slope of the regression line is undefined. - (C) Switching which variable is called x and which is called y changes the sign - of the correlation. 
(D) The correlation r is equal to the slope of the regression - line when z-scores for the y-variable are plotted against z-scores for the x-variable. - - A: Let''s think step by step. Statement A is false because the slope of the regression - line being exactly 1 can occur even when the two variables are not perfectly correlated. - Statement B is false because uncorrelated variables regression lines can have slope - zero. Statement C is false because correlation is symmetric in the two random variables. - The answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_statistics +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\nQ: A new smartwatch is manufactured in one part of a factory,\ + \ then secured for shipping in another, independent part of the factory. The weight\ + \ of the smartwatch has a mean of 62 grams and a standard deviation of 1.0 grams.\ + \ The weight of the packaging (box, user's guide, bubble wrap, etc.) has a mean\ + \ of 456 grams and a standard deviation of 6 grams. Together, the distribution of\ + \ the weight of the smartwatch and its packaging would have the following mean and\ + \ standard deviation:\n(A) Mean 518 grams; standard deviation 7.0 grams (B) Mean\ + \ 518 grams; standard deviation 3.5 grams (C) Mean 518 grams; standard deviation\ + \ 6.1 grams (D) Mean 394 grams; standard deviation 6.1 grams\nA: Let's think step\ + \ by step. Since the weight of the watch and the weight of the packaging are independent\ + \ random variables, the mean and variance of their sum is equal to the sum of their\ + \ individual means and variances. So the mean is 62 + 456 = 518 grams, and the variances\ + \ is 1.0^2 + 6.0^2 = 37, leading to a standard deviation of 6.1 grams. The answer\ + \ is (C).\n\nQ: After a frost warning was issued, the owner of a large orange grove\ + \ asked his workers to spray all his trees with water. The water was supposed to\ + \ freeze and form a protective covering of ice around the orange blossom. Nevertheless,\ + \ the owner suspected that some trees suffered considerable damage due to the frost.\ + \ To estimate the proportion of trees that suffered more than 50 percent damage\ + \ due to the frost, he took a random sample of 100 trees from his grove. What is\ + \ the response variable in this experiment?\n(A) The proportion of trees that suffered\ + \ more than 50 percent damage due to frost. (B) The number of trees affected by\ + \ the frost. (C) The number of trees sampled from the grove. (D) For each sampled\ + \ tree, whether it suffered more than 50 percent damage or at most 50 percent damage.\n\ + A: Let's think step by step. In this experiment, the response variable is what is\ + \ measured. For each tree, what is measured is whether or not it suffered more than\ + \ 50 percent damage due to the frost. The answer is (D).\n\nQ: Suppose X and Y are\ + \ random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and var(Y) = 12. What\ + \ are the expected value and variance of the random variable X + Y?\n(A) E(X + Y)\ + \ = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X + Y) = 99, var(X\ + \ + Y) = 17 (D) There is insufficient information to answer this question.\nA: Let's\ + \ think step by step. 
While means of sums of random variables add (regardless of\ + \ whether the variables are independent) in order to determine the variance of a\ + \ sum of random variables, we need to know not just their individual variances but\ + \ the covariance of the two variables, which is not given in this problem. The answer\ + \ is (D).\n\nQ: Which of the following sets has the smallest standard deviation?\ + \ Which has the largest?\nI: {1,2,3}\nII: {-10,10}\nIII: {100}\n(A) I, II (B) II,\ + \ III (C) III, I (D) III, II\nA: Let's think step by step. The variance of distribution\ + \ I is the expected squared deviation from its mean (which is 2), so the variance\ + \ is 2/3 . The variance of distribution II is 10^2 (because both elements are 10\ + \ away from the mean of zero). The variance of distribution III is 0, since it has\ + \ a single entry. So distribution III has the smallest standard deviation and distribution\ + \ II has the largest. The answer is (D).\n\nQ: Which of the following is a correct\ + \ statement about correlation?\n(A) If the slope of the regression line is exactly\ + \ 1, then the correlation is exactly 1. (B) If the correlation is 0, then the slope\ + \ of the regression line is undefined. (C) Switching which variable is called x\ + \ and which is called y changes the sign of the correlation. (D) The correlation\ + \ r is equal to the slope of the regression line when z-scores for the y-variable\ + \ are plotted against z-scores for the x-variable.\nA: Let's think step by step.\ + \ Statement A is false because the slope of the regression line being exactly 1\ + \ can occur even when the two variables are not perfectly correlated. Statement\ + \ B is false because uncorrelated variables regression lines can have slope zero.\ + \ Statement C is false because correlation is symmetric in the two random variables.\ + \ The answer is (D)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml index 951666d1..d8d0bfbb 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_us_history -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ \ school us history.\n\nQ: This question refers to the following information.\n\ I come not to urge personal claims, nor to seek individual benefits; I appear as\ \ the advocate of those who cannot plead their own cause; I come as the friend of\ @@ -8,126 +8,127 @@ description: "The following are multiple choice questions (with answers) about h \ jails penetrate not your Halls of Legislation. 
I am the Hope of the poor crazed\ \ beings who pine in the cells, and stalls, and cages, and waste rooms of your poor-houses.\ \ I am the Revelation of hundreds of wailing, suffering creatures, hidden in your\ - \ private dwellings, and in pens and cabins\u2014shut out, cut off from all healing\ - \ influences, from all mind-restoring cares.\u2026 Could their melancholy histories\ - \ be spread before you as revealed to my grieved spirit during the last three months,\ - \ how promptly, how earnestly would you search out the most approved means of relief;\ - \ how trifling, how insignificant, by comparison, would appear the sacrifices you\ - \ are asked to make; how would a few dimes and dollars, gathered from each citizen,\ - \ diminish in value as a possession, compared with the certain benefits and vast\ - \ good to be secured for the suffering insane...by the consecration and application\ - \ of a sufficient fund to the construction of a suitable hospital.\u2026\n\u2014\ - Dorothea Dix, Memorial Soliciting a State Hospital for the Protection and Cure of\ - \ the Insane,\nSubmitted to the General Assembly of North Carolina, November 1848\n\ - Dorothea Dix can best be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C)\ - \ Shirley Temple (D) Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on us history for help. Both Dorothea Dix and Clara barton are American\ - \ nurses. The answer is (B).\n\nQ: This question refers to the following information.\n\ - \"As our late Conduct at the Conestoga Manor and Lancaster have occasioned much\ - \ Speculation & a great diversity of Sentiments in this and neighboring Governments;\ - \ some vindicating & others condemning it; some charitably alleviating the Crime,\ - \ & others maliciously painting it in the most odious & detestable Colours, we think\ - \ it our duty to lay before the Publick, the whole Matter as it appeared, & still\ - \ appears, to us. . . .\n\"If these things are not sufficient to prove an unjustifiable\ - \ Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend\ - \ them & an utter insensibility to human Distresses, let us consider a few more\ - \ recent Facts. When we found the last Summer that we were likely to get no Assistance\ - \ from the Government, some Volunteers went out at our own Expense, determined to\ - \ drive our Enemies from our Borders; & when we came near to the great Island, we\ - \ understood that a Number of their Warriors had gone out against our Frontiers.\ - \ Upon this we returned and came up with them and fought with them at the Munfey\ - \ Hill where we lost some of our Men & killed some of their Warriors & thereby saved\ - \ our Frontiers from this Story in another Expedition. 
But no sooner had we destroyed\ - \ their Provisions on the great Island, & ruined their trade with the good People\ - \ at Bethlehem, but these very Indians, who were justly suspected of having murdered\ - \ our Friends in Northampton County, were by the Influence of some Quakers taken\ - \ under the Protection of the Government to screen them from the Resentments of\ - \ the Friends and Relations of the Murdered, & to support them thro the Winter.\"\ - \n\u2014\"Apology of the Paxton Boys\" (pamphlet), 1764 (Note: \"apology\" in this\ - \ context should be read as an explanation, not an admission of guilt or regret.\n\ - The sentiments expressed in the explanation above reflect which of the ongoing tensions\ - \ during the colonial period of American history?\n(A) Tensions between British\ - \ policies and the aspirations of North American colonists. (B) Tensions between\ - \ American Indians allied with the French and those allied with the British. (C)\ - \ Tensions between freed African Americans and white planters. (D) Tensions between\ - \ backcountry settlers and elites within colonial America.\nA: Let's think step\ - \ by step. We refer to Wikipedia articles on us history for help. After the French\ - \ and Indian War, the Scotch-Irish settlers attacked American Indians. After the\ - \ attacks on the Conestoga, about 250 Paxton Boys present their grievances to the\ - \ Pennsylvania legislature. As mentioned in the information, the Paxton Boys cited\ - \ resentiment at local elites. The answer is (D).\n\nQ: This question refers to\ - \ the following information.\nOur leaders talk about stopping aggression from the\ - \ north, but this was a struggle among groups of Vietnamese until we intervened.\ - \ We seem bent upon saving the Vietnamese from Ho Chi Minh even if we have to kill\ - \ them and demolish their country to do it. As the native people survey bombed-out\ - \ villages, women and children burned by napalm, rice crops destroyed and cities\ - \ overrun with our military personnel, they are doubtless saying secretly of the\ - \ Vietcong guerillas and of the American forces, \"A plague on both your houses.\"\ - \ \u2026 Stop the bombing, north and south, end search and destroy offensive sweeps,\ - \ and confine our military action to holding operations on the ground. Bombing the\ - \ north has failed to halt or seriously check the flow of troops to the south and\ - \ may, in fact, have prompted a much greater war effort by Hanoi.\n\u2014Senator\ - \ George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\nWhich of the following\ - \ opinions from the 1960s most directly reflects the perspective of George McGovern's\ - \ speech?\n(A) Americans must maximize their technological edge in Vietnam. (B)\ - \ American bombing in Vietnam is step by step leading to progress in the war. (C)\ - \ American bombing in Vietnam is a failure. (D) America must not give in to defeatism\ - \ about the war in Vietnam.\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on us history for help. \"Stop the bombing\" and \"Bombing the north\ - \ has failed to halt or seriously check the flow of troops to the south\" indicate\ - \ that the perspective of George McGovern's speech is that Amerian bombing in Vietnam\ - \ is a failure. 
The answer is (C).\n\nQ: This question refers to the following information.\n\ - \"In the new Code of Laws which I suppose it will be necessary for you to make I\ - \ desire you would Remember the Ladies, and be more generous and favorable to them\ - \ than your ancestors. Do not put such unlimited power into the hands of the Husbands.\ - \ Remember all Men would be tyrants if they could. If particular care and attention\ - \ is not paid to the Ladies we are determined to foment a Rebellion, and will not\ - \ hold ourselves bound by any Laws in which we have no voice, or Representation.\"\ - \nAbigail Adams, in a letter to John Adams, 1776\n\"Special legislation for woman\ - \ has placed us in a most anomalous position. Women invested with the rights of\ - \ citizens in one section\u2014voters, jurors, office-holders\u2014crossing an imaginary\ - \ line, are subjects in the next. In some States, a married woman may hold property\ - \ and transact business in her own name; in others, her earnings belong to her husband.\ - \ In some States, a woman may testify against her husband, sue and be sued in the\ - \ courts; in others, she has no redress in case of damage to person, property, or\ - \ character. In case of divorce on account of adultery in the husband, the innocent\ - \ wife is held to possess no right to children or property, unless by special decree\ - \ of the court. But in no State of the Union has the wife the right to her own person,\ - \ or to any part of the joint earnings of the co-partnership during the life of\ - \ her husband. In some States women may enter the law schools and practice in the\ - \ courts; in others they are forbidden. In some universities girls enjoy equal educational\ - \ advantages with boys, while many of the proudest institutions in the land deny\ - \ them admittance, though the sons of China, Japan and Africa are welcomed there.\ - \ But the privileges already granted in the several States are by no means secure.\"\ - \nSusan B. Anthony, \"Declaration of Rights for Women,\" July 4, 1876\nThe sentiments\ - \ expressed in the second excerpt by Susan B. Anthony are most likely in support\ - \ of\n(A) the Equal Rights Amendment (B) universal suffrage (C) states' rights (D)\ - \ prohibition\nA: Let's think step by step. We refer to Wikipedia articles on us\ - \ history for help. The above information mentioned that women are in an anomalous\ - \ position in terms of legislation. Women's earnings do not belong to themselves,\ - \ or they cannot testify against her husbands. Susan believes women should have\ - \ equal legal rights as men. The answer is (B).\n\nQ: This question refers to the\ - \ following information.\n\"Society in every state is a blessing, but government\ - \ even in its best state is but a necessary evil; in its worst state an intolerable\ - \ one; for when we suffer, or are exposed to the same miseries by a government,\ - \ which we might expect in a country without government, our calamity is heightened\ - \ by reflecting that we furnish the means by which we suffer. Government, like dress,\ - \ is the badge of lost innocence; the palaces of kings are built on the ruins of\ - \ the bowers of paradise. 
For were the impulses of conscience clear, uniform, and\ - \ irresistibly obeyed, man would need no other lawgiver; but that not being the\ - \ case, he finds it necessary to surrender up a part of his property to furnish\ - \ means for the protection of the rest; and this he is induced to do by the same\ - \ prudence which in every other case advises him out of two evils to choose the\ - \ least. Wherefore, security being the true design and end of government, it unanswerably\ - \ follows that whatever form thereof appears most likely to ensure it to us, with\ - \ the least expense and greatest benefit, is preferable to all others.\"\nThomas\ - \ Paine, Common Sense, 1776\nWhich of the following \"miseries\" alluded to above\ - \ were most condemned by Anti-Federalists of the post-Revolutionary era?\n(A) Organized\ - \ response to Bacon's Rebellion (B) Federal response to Shays's Rebellion (C) Federal\ - \ response to the Whiskey Rebellion (D) Federal response to Pontiac's Rebellion\n\ - A: Let's think step by step. We refer to Wikipedia articles on us history for help.\ - \ Anti-Federalists do not believe centralized government power, and suspect Washington's\ - \ military response to Whiskey Rebellion. Bacon's Rebellion and Pontiac's Rebellion\ - \ happen before the Revolution and they can be ruled out. The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_us_history + \ private dwellings, and in pens and cabins—shut out, cut off from all healing influences,\ + \ from all mind-restoring cares.… Could their melancholy histories be spread before\ + \ you as revealed to my grieved spirit during the last three months, how promptly,\ + \ how earnestly would you search out the most approved means of relief; how trifling,\ + \ how insignificant, by comparison, would appear the sacrifices you are asked to\ + \ make; how would a few dimes and dollars, gathered from each citizen, diminish\ + \ in value as a possession, compared with the certain benefits and vast good to\ + \ be secured for the suffering insane...by the consecration and application of a\ + \ sufficient fund to the construction of a suitable hospital.…\n—Dorothea Dix, Memorial\ + \ Soliciting a State Hospital for the Protection and Cure of the Insane,\nSubmitted\ + \ to the General Assembly of North Carolina, November 1848\nDorothea Dix can best\ + \ be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C) Shirley Temple (D)\ + \ Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ us history for help. Both Dorothea Dix and Clara barton are American nurses. The\ + \ answer is (B).\n\nQ: This question refers to the following information.\n\"As\ + \ our late Conduct at the Conestoga Manor and Lancaster have occasioned much Speculation\ + \ & a great diversity of Sentiments in this and neighboring Governments; some vindicating\ + \ & others condemning it; some charitably alleviating the Crime, & others maliciously\ + \ painting it in the most odious & detestable Colours, we think it our duty to lay\ + \ before the Publick, the whole Matter as it appeared, & still appears, to us. .\ + \ . 
.\n\"If these things are not sufficient to prove an unjustifiable Attachment\ + \ in the Quakers to the Indians Savages, a fixed Resolution to befriend them & an\ + \ utter insensibility to human Distresses, let us consider a few more recent Facts.\ + \ When we found the last Summer that we were likely to get no Assistance from the\ + \ Government, some Volunteers went out at our own Expense, determined to drive our\ + \ Enemies from our Borders; & when we came near to the great Island, we understood\ + \ that a Number of their Warriors had gone out against our Frontiers. Upon this\ + \ we returned and came up with them and fought with them at the Munfey Hill where\ + \ we lost some of our Men & killed some of their Warriors & thereby saved our Frontiers\ + \ from this Story in another Expedition. But no sooner had we destroyed their Provisions\ + \ on the great Island, & ruined their trade with the good People at Bethlehem, but\ + \ these very Indians, who were justly suspected of having murdered our Friends in\ + \ Northampton County, were by the Influence of some Quakers taken under the Protection\ + \ of the Government to screen them from the Resentments of the Friends and Relations\ + \ of the Murdered, & to support them thro the Winter.\"\n—\"Apology of the Paxton\ + \ Boys\" (pamphlet), 1764 (Note: \"apology\" in this context should be read as an\ + \ explanation, not an admission of guilt or regret.\nThe sentiments expressed in\ + \ the explanation above reflect which of the ongoing tensions during the colonial\ + \ period of American history?\n(A) Tensions between British policies and the aspirations\ + \ of North American colonists. (B) Tensions between American Indians allied with\ + \ the French and those allied with the British. (C) Tensions between freed African\ + \ Americans and white planters. (D) Tensions between backcountry settlers and elites\ + \ within colonial America.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on us history for help. After the French and Indian War, the Scotch-Irish settlers\ + \ attacked American Indians. After the attacks on the Conestoga, about 250 Paxton\ + \ Boys present their grievances to the Pennsylvania legislature. As mentioned in\ + \ the information, the Paxton Boys cited resentiment at local elites. The answer\ + \ is (D).\n\nQ: This question refers to the following information.\nOur leaders\ + \ talk about stopping aggression from the north, but this was a struggle among groups\ + \ of Vietnamese until we intervened. We seem bent upon saving the Vietnamese from\ + \ Ho Chi Minh even if we have to kill them and demolish their country to do it.\ + \ As the native people survey bombed-out villages, women and children burned by\ + \ napalm, rice crops destroyed and cities overrun with our military personnel, they\ + \ are doubtless saying secretly of the Vietcong guerillas and of the American forces,\ + \ \"A plague on both your houses.\" … Stop the bombing, north and south, end search\ + \ and destroy offensive sweeps, and confine our military action to holding operations\ + \ on the ground. Bombing the north has failed to halt or seriously check the flow\ + \ of troops to the south and may, in fact, have prompted a much greater war effort\ + \ by Hanoi.\n—Senator George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\n\ + Which of the following opinions from the 1960s most directly reflects the perspective\ + \ of George McGovern's speech?\n(A) Americans must maximize their technological\ + \ edge in Vietnam. 
(B) American bombing in Vietnam is step by step leading to progress\ + \ in the war. (C) American bombing in Vietnam is a failure. (D) America must not\ + \ give in to defeatism about the war in Vietnam.\nA: Let's think step by step. We\ + \ refer to Wikipedia articles on us history for help. \"Stop the bombing\" and \"\ + Bombing the north has failed to halt or seriously check the flow of troops to the\ + \ south\" indicate that the perspective of George McGovern's speech is that Amerian\ + \ bombing in Vietnam is a failure. The answer is (C).\n\nQ: This question refers\ + \ to the following information.\n\"In the new Code of Laws which I suppose it will\ + \ be necessary for you to make I desire you would Remember the Ladies, and be more\ + \ generous and favorable to them than your ancestors. Do not put such unlimited\ + \ power into the hands of the Husbands. Remember all Men would be tyrants if they\ + \ could. If particular care and attention is not paid to the Ladies we are determined\ + \ to foment a Rebellion, and will not hold ourselves bound by any Laws in which\ + \ we have no voice, or Representation.\"\nAbigail Adams, in a letter to John Adams,\ + \ 1776\n\"Special legislation for woman has placed us in a most anomalous position.\ + \ Women invested with the rights of citizens in one section—voters, jurors, office-holders—crossing\ + \ an imaginary line, are subjects in the next. In some States, a married woman may\ + \ hold property and transact business in her own name; in others, her earnings belong\ + \ to her husband. In some States, a woman may testify against her husband, sue and\ + \ be sued in the courts; in others, she has no redress in case of damage to person,\ + \ property, or character. In case of divorce on account of adultery in the husband,\ + \ the innocent wife is held to possess no right to children or property, unless\ + \ by special decree of the court. But in no State of the Union has the wife the\ + \ right to her own person, or to any part of the joint earnings of the co-partnership\ + \ during the life of her husband. In some States women may enter the law schools\ + \ and practice in the courts; in others they are forbidden. In some universities\ + \ girls enjoy equal educational advantages with boys, while many of the proudest\ + \ institutions in the land deny them admittance, though the sons of China, Japan\ + \ and Africa are welcomed there. But the privileges already granted in the several\ + \ States are by no means secure.\"\nSusan B. Anthony, \"Declaration of Rights for\ + \ Women,\" July 4, 1876\nThe sentiments expressed in the second excerpt by Susan\ + \ B. Anthony are most likely in support of\n(A) the Equal Rights Amendment (B) universal\ + \ suffrage (C) states' rights (D) prohibition\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on us history for help. The above information mentioned\ + \ that women are in an anomalous position in terms of legislation. Women's earnings\ + \ do not belong to themselves, or they cannot testify against her husbands. Susan\ + \ believes women should have equal legal rights as men. 
The answer is (B).\n\nQ:\ + \ This question refers to the following information.\n\"Society in every state is\ + \ a blessing, but government even in its best state is but a necessary evil; in\ + \ its worst state an intolerable one; for when we suffer, or are exposed to the\ + \ same miseries by a government, which we might expect in a country without government,\ + \ our calamity is heightened by reflecting that we furnish the means by which we\ + \ suffer. Government, like dress, is the badge of lost innocence; the palaces of\ + \ kings are built on the ruins of the bowers of paradise. For were the impulses\ + \ of conscience clear, uniform, and irresistibly obeyed, man would need no other\ + \ lawgiver; but that not being the case, he finds it necessary to surrender up a\ + \ part of his property to furnish means for the protection of the rest; and this\ + \ he is induced to do by the same prudence which in every other case advises him\ + \ out of two evils to choose the least. Wherefore, security being the true design\ + \ and end of government, it unanswerably follows that whatever form thereof appears\ + \ most likely to ensure it to us, with the least expense and greatest benefit, is\ + \ preferable to all others.\"\nThomas Paine, Common Sense, 1776\nWhich of the following\ + \ \"miseries\" alluded to above were most condemned by Anti-Federalists of the post-Revolutionary\ + \ era?\n(A) Organized response to Bacon's Rebellion (B) Federal response to Shays's\ + \ Rebellion (C) Federal response to the Whiskey Rebellion (D) Federal response to\ + \ Pontiac's Rebellion\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on us history for help. Anti-Federalists do not believe centralized government\ + \ power, and suspect Washington's military response to Whiskey Rebellion. Bacon's\ + \ Rebellion and Pontiac's Rebellion happen before the Revolution and they can be\ + \ ruled out. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml index 1cf4bbdb..6db82ea6 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_world_history -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high\ \ school world history.\n\nQ: This question refers to the following information.\n\ \"At least one of the [world's] societies would have to somehow enormously increase\ \ its productivity [in order to achieve global hegemony]. That quantum jump would\ @@ -8,75 +8,75 @@ description: "The following are multiple choice questions (with answers) about h \ be accomplished by exploiting the ecosystems, mineral resources, and human assets\ \ of whole continents outside the lands of the society making the jump. 
Western\ \ Europe did just that by means of its brutality and guns and, more important, by\ - \ geographical and ecological luck.\"\nCopyright \xA9 2015 Cambridge University\ - \ Press.\nAlfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum\ - \ jump\" mentioned in the passage most directly contributed to which of the following\ - \ developments in the period 1450\u20131750 C.E.?\n(A) A breakdown in trade routes\ - \ through the collapse of the established state structure (B) An increase in the\ - \ population of the world through more plentiful supplies of food (C) The spread\ - \ of Chinese and Indian belief systems across the world (D) An increase in social\ - \ unrest\nA: Let's think step by step. We refer to Wikipedia articles on world history\ - \ for help. The \"quantum jump\" mentioned in the passage refers to the conquest\ - \ of the New World and the Columbian Exchange. Choice (A) and (C) did not happen\ - \ in history. Choice (C) refers to the human assets. The answer is (B).\n\nQ: This\ - \ question refers to the following information.\n\"The struggle against neo-colonialism\ - \ is not aimed at excluding the capital of the developed world from operating in\ - \ less developed countries. It is aimed at preventing the financial power of the\ - \ developed countries being used in such a way as to impoverish the less developed.\n\ - Non-alignment, as practiced by Ghana and many other countries, is based on co-operation\ - \ with all States whether they be capitalist, socialist or have a mixed economy.\ - \ Such a policy, therefore, involves foreign investment from capitalist countries,\ - \ but it must be invested in accordance with a national plan drawn up by the government\ - \ of the non-aligned State with its own interests in mind. The issue is not what\ - \ return the foreign investor receives on his investments\u2026The question is one\ - \ of power. A State in the grip of neo-colonialism is not master of its own destiny.\"\ - \nKwame Nkrumah, Neo-Colonialism, 1965\nWhich of the following provides the best\ - \ context for Nkrumah's writings?\n(A) The Industrial Revolution (B) Decolonization\ - \ (C) Regional Free Trade Associations (D) Autarky\nA: Let's think step by step.\ - \ We refer to Wikipedia articles on world history for help. The passage expresses\ - \ a point that the successful fight against neo-colonialism were in danger and the\ - \ newly independent nations like Ghana may be re-colonized via financial power of\ - \ the developed countries. The answer is (B).\n\nQ: This question refers to the\ - \ following information.\n\"Indeed, as both the fatwas of distinguished [scholars]\ - \ who base their opinion on reason and tradition alike and the consensus of the\ - \ Sunni community agree that the ancient obligation of extirpation, extermination,\ - \ and expulsion of evil innovation must be the aim of our exalted aspiration, for\ - \ \"Religious zeal is a victory for the Faith of God the Beneficent\"; then, in\ - \ accordance with the words of the Prophet (Peace upon him!) 
\"Whosoever introduces\ - \ evil innovation into our order must be expelled\" and \"Whosoever does aught against\ - \ our order must be expelled,\" action has become necessary and exigent\u2026\"\n\ - Letter from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514\nThe letter from\ - \ Selim I is most clearly an example of which of the following?\n(A) The maintenance\ - \ of military supremacy at all costs (B) Expanding tensions between religious sects\ - \ (C) Factors that brought about the collapse of the Ottoman Empire (D) Peacemaking\ - \ efforts among the Islamic empires\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on world history for help. The passage is an example of expanding tensions\ - \ between Selim and Ismail. In the passage the Selim references the fatwa and the\ - \ consensus of the Sunni community to against whosoever introduces evil. The answer\ - \ is (B).\n\nQ: This question refers to the following information.\n\"The real grievance\ - \ of the worker is the insecurity of his existence; he is not sure that he will\ - \ always have work, he is not sure that he will always be healthy, and he foresees\ - \ that he will one day be old and unfit to work. If he falls into poverty, even\ - \ if only through a prolonged illness, he is then completely helpless, exam_ins\ - \ to his own devices, and society does not currently recognize any real obligation\ - \ towards him beyond the usual help for the poor, even if he has been working all\ - \ the time ever so faithfully and diligently. The usual help for the poor, however,\ - \ leaves a lot to be desired, especially in large cities, where it is very much\ - \ worse than in the country.\"\nOtto von Bismarck, 1884\nOtto von Bismarck likely\ - \ made this speech in reaction to which of the following issues?\n(A) Social acceptance\ - \ of child labor (B) Declining life expectancy in Germany (C) Criticisms of German\ - \ trade tariffs (D) Negative effects attributed to industrial capitalism\nA: Let's\ - \ think step by step. We refer to Wikipedia articles on world history for help.\ - \ The passage talks about the grievance of the work under the industrial capitalism.\ - \ The answer is (D).\n\nQ: This question refers to the following information.\n\ - He contains all works and desires and all perfumes and all tastes. He enfolds the\ - \ whole universe and in silence is loving to all. This is the Spirit that is in\ - \ my heart, this is Brahman. To him I shall come when I go beyond this life, and\ - \ to him will come he who has faith and doubts not.\n\u2014The Upanishads, India,\ - \ c. 1000 BCE\nTo which religion does the speaker most likely belong?\n(A) Hinduism\ - \ (B) Buddhism (C) Shintoism (D) Zoroastrianism\nA: Let's think step by step. We\ - \ refer to Wikipedia articles on world history for help. Brahman refers to the ultimate\ - \ reality of all things in the Hindu religion. In contrast, Buddhism does not have\ - \ a concept of supreme God. The answer is (A)." 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_world_history + \ geographical and ecological luck.\"\nCopyright © 2015 Cambridge University Press.\n\ + Alfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum jump\" mentioned\ + \ in the passage most directly contributed to which of the following developments\ + \ in the period 1450–1750 C.E.?\n(A) A breakdown in trade routes through the collapse\ + \ of the established state structure (B) An increase in the population of the world\ + \ through more plentiful supplies of food (C) The spread of Chinese and Indian belief\ + \ systems across the world (D) An increase in social unrest\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on world history for help. The \"quantum\ + \ jump\" mentioned in the passage refers to the conquest of the New World and the\ + \ Columbian Exchange. Choice (A) and (C) did not happen in history. Choice (C) refers\ + \ to the human assets. The answer is (B).\n\nQ: This question refers to the following\ + \ information.\n\"The struggle against neo-colonialism is not aimed at excluding\ + \ the capital of the developed world from operating in less developed countries.\ + \ It is aimed at preventing the financial power of the developed countries being\ + \ used in such a way as to impoverish the less developed.\nNon-alignment, as practiced\ + \ by Ghana and many other countries, is based on co-operation with all States whether\ + \ they be capitalist, socialist or have a mixed economy. Such a policy, therefore,\ + \ involves foreign investment from capitalist countries, but it must be invested\ + \ in accordance with a national plan drawn up by the government of the non-aligned\ + \ State with its own interests in mind. The issue is not what return the foreign\ + \ investor receives on his investments…The question is one of power. A State in\ + \ the grip of neo-colonialism is not master of its own destiny.\"\nKwame Nkrumah,\ + \ Neo-Colonialism, 1965\nWhich of the following provides the best context for Nkrumah's\ + \ writings?\n(A) The Industrial Revolution (B) Decolonization (C) Regional Free\ + \ Trade Associations (D) Autarky\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on world history for help. The passage expresses a point that the successful\ + \ fight against neo-colonialism were in danger and the newly independent nations\ + \ like Ghana may be re-colonized via financial power of the developed countries.\ + \ The answer is (B).\n\nQ: This question refers to the following information.\n\"\ + Indeed, as both the fatwas of distinguished [scholars] who base their opinion on\ + \ reason and tradition alike and the consensus of the Sunni community agree that\ + \ the ancient obligation of extirpation, extermination, and expulsion of evil innovation\ + \ must be the aim of our exalted aspiration, for \"Religious zeal is a victory for\ + \ the Faith of God the Beneficent\"; then, in accordance with the words of the Prophet\ + \ (Peace upon him!) 
\"Whosoever introduces evil innovation into our order must be\ + \ expelled\" and \"Whosoever does aught against our order must be expelled,\" action\ + \ has become necessary and exigent…\"\nLetter from Ottoman Sultan Selim I to Safavid\ + \ Shah Ismail I, 1514\nThe letter from Selim I is most clearly an example of which\ + \ of the following?\n(A) The maintenance of military supremacy at all costs (B)\ + \ Expanding tensions between religious sects (C) Factors that brought about the\ + \ collapse of the Ottoman Empire (D) Peacemaking efforts among the Islamic empires\n\ + A: Let's think step by step. We refer to Wikipedia articles on world history for\ + \ help. The passage is an example of expanding tensions between Selim and Ismail.\ + \ In the passage the Selim references the fatwa and the consensus of the Sunni community\ + \ to against whosoever introduces evil. The answer is (B).\n\nQ: This question refers\ + \ to the following information.\n\"The real grievance of the worker is the insecurity\ + \ of his existence; he is not sure that he will always have work, he is not sure\ + \ that he will always be healthy, and he foresees that he will one day be old and\ + \ unfit to work. If he falls into poverty, even if only through a prolonged illness,\ + \ he is then completely helpless, exam_ins to his own devices, and society does\ + \ not currently recognize any real obligation towards him beyond the usual help\ + \ for the poor, even if he has been working all the time ever so faithfully and\ + \ diligently. The usual help for the poor, however, leaves a lot to be desired,\ + \ especially in large cities, where it is very much worse than in the country.\"\ + \nOtto von Bismarck, 1884\nOtto von Bismarck likely made this speech in reaction\ + \ to which of the following issues?\n(A) Social acceptance of child labor (B) Declining\ + \ life expectancy in Germany (C) Criticisms of German trade tariffs (D) Negative\ + \ effects attributed to industrial capitalism\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on world history for help. The passage talks about the grievance\ + \ of the work under the industrial capitalism. The answer is (D).\n\nQ: This question\ + \ refers to the following information.\nHe contains all works and desires and all\ + \ perfumes and all tastes. He enfolds the whole universe and in silence is loving\ + \ to all. This is the Spirit that is in my heart, this is Brahman. To him I shall\ + \ come when I go beyond this life, and to him will come he who has faith and doubts\ + \ not.\n—The Upanishads, India, c. 1000 BCE\nTo which religion does the speaker\ + \ most likely belong?\n(A) Hinduism (B) Buddhism (C) Shintoism (D) Zoroastrianism\n\ + A: Let's think step by step. We refer to Wikipedia articles on world history for\ + \ help. Brahman refers to the ultimate reality of all things in the Hindu religion.\ + \ In contrast, Buddhism does not have a concept of supreme God. The answer is (A)." 
+"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml index 9d652132..3d1f5971 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml @@ -1,48 +1,28 @@ -dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human - aging. - - - Q: All other things being equal, which of the following persons is more likely to - show osteoporosis? - - (A) An older Hispanic American woman (B) An older African American woman (C) An - older Asian American woman (D) An older Native American woman - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. Although osteoporosis can occur at any age, the risk is higher for older people. - It is most common in Asian and non-Hispanic white women. The answer is (C). - - - Q: The finding that adults tend to remember events from their adolescence better - than from other periods in their lives is referred to as the - - (A) Adolescence advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic - retrieval spike - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. Reminiscence bump is a phenomenon that older adults tend to recollect events - during their young ages. People usually have a period of childhood amnesia from - birth to around age 5, and a reminiscence bump between 10 and 30. The answer is - (B). - - - Q: Which element in tobacco smoke is responsible for cancers? - - (A) Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. The benzene, acrylamide and acrylonitrile in tar interact with the lungs and - cause DNA mutations in cells of the lungs, and lead to cancer. The answer is (B). - - - Q: When older adults move to a new state after retirement, which of the following - is the more likely destination? - - (A) Texas (B) California (C) Hawaii (D) Vermont - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. Texas does not have state tax, and has low cost of living compared with the - other three options. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_human_aging +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\nQ: All other things being equal, which of the following persons is more\ + \ likely to show osteoporosis?\n(A) An older Hispanic American woman (B) An older\ + \ African American woman (C) An older Asian American woman (D) An older Native American\ + \ woman\nA: Let's think step by step. We refer to Wikipedia articles on human aging\ + \ for help. Although osteoporosis can occur at any age, the risk is higher for older\ + \ people. It is most common in Asian and non-Hispanic white women. The answer is\ + \ (C).\n\nQ: The finding that adults tend to remember events from their adolescence\ + \ better than from other periods in their lives is referred to as the\n(A) Adolescence\ + \ advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic retrieval\ + \ spike\nA: Let's think step by step. We refer to Wikipedia articles on human aging\ + \ for help. 
Reminiscence bump is a phenomenon that older adults tend to recollect\
\ events during their young ages. People usually have a period of childhood amnesia\
\ from birth to around age 5, and a reminiscence bump between 10 and 30. The answer\
\ is (B).\n\nQ: Which element in tobacco smoke is responsible for cancers?\n(A)\
\ Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles\nA: Let's think step\
\ by step. We refer to Wikipedia articles on human aging for help. The benzene,\
\ acrylamide and acrylonitrile in tar interact with the lungs and cause DNA mutations\
\ in cells of the lungs, and lead to cancer. The answer is (B).\n\nQ: When older\
\ adults move to a new state after retirement, which of the following is the more\
\ likely destination?\n(A) Texas (B) California (C) Hawaii (D) Vermont\nA: Let's\
\ think step by step. We refer to Wikipedia articles on human aging for help. Texas\
\ does not have state tax, and has low cost of living compared with the other three\
\ options. The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_human_aging"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml
index 6b7a12cc..68a84092 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml
@@ -1,61 +1,34 @@
-dataset_name: human_sexuality
-description: 'The following are multiple choice questions (with answers) about human
- sexuality.
-
-
- Q: The most common disorder among men who seek sexual therapy is:
-
- (A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory
- disorder
-
- A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
- for help. The most common type of arousal disorder for men is erectile dysfunction,
- meaning that a person is not able to get any physical satisfaction from sexual activity
- although he may be interested in it. The answer is (C).
-
-
- Q: A woman who knows she has active herpes and untreated syphilis but continues
- to have sex without informing her partners of her condition has, in psychoanalytic
- terms:
-
- (A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego
-
- A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
- for help. A person with weak superego tends to be delinquent, criminal or have antisocial
- personality. The action of the woman who knows she has active venereal disease but
- still has sex with her partners indicates she may have an antisocial personality. The
- answer is (B).
-
-
- Q: The nature of homosexual activities that occur during preadolescence include
- all but which of the following?
-
- (A) sexual intercourse (B) circle jerks (C) exhibitionism (D) touching each other''s
- genitals
-
- A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
- for help. Sexual intercourse is prohibited during preadolescence. The answer is
- (A).
-
-
- Q: Women''s ability to have multiple orgasms is primarily due to:
-
- (A) the fact that they do not have a refractory period. (B) the response of the
- inner layers of the vagina. (C) having alternating orgasms in different locations.
- (D) the G-Spot.
-
- A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
- for help.
The refractory period is the time when a person is not able to be erect
- or is not interested in sex. The answer is (A).
-
-
- Q: Morning sickness is typically a problem:
-
- (A) during the first trimester (B) during the second trimester (C) during the third
- trimester (D) all through the pregnancy
-
- A: Let''s think step by step. We refer to Wikipedia articles on human sexuality
- for help. Morning sickness usually begins by nine weeks after conception, corresponding
- to the first trimester. The answer is (A).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_human_sexuality
+"dataset_name": "human_sexuality"
+"description": "The following are multiple choice questions (with answers) about human\
\ sexuality.\n\nQ: The most common disorder among men who seek sexual therapy is:\n\
(A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory\
\ disorder\nA: Let's think step by step. We refer to Wikipedia articles on human\
\ sexuality for help. The most common type of arousal disorder for men is erectile\
\ dysfunction, meaning that a person is not able to get any physical satisfaction\
\ from sexual activity although he may be interested in it. The answer is (C).\n\
\nQ: A woman who knows she has active herpes and untreated syphilis but continues\
\ to have sex without informing her partners of her condition has, in psychoanalytic\
\ terms:\n(A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego\n\
A: Let's think step by step. We refer to Wikipedia articles on human sexuality for\
\ help. A person with weak superego tends to be delinquent, criminal or have antisocial\
\ personality. The action of the woman who knows she has active venereal disease\
\ but still has sex with her partners indicates she may have an antisocial personality.\
\ The answer is (B).\n\nQ: The nature of homosexual activities that occur during\
\ preadolescence include all but which of the following?\n(A) sexual intercourse\
\ (B) circle jerks (C) exhibitionism (D) touching each other's genitals\nA: Let's\
\ think step by step. We refer to Wikipedia articles on human sexuality for help.\
\ Sexual intercourse is prohibited during preadolescence. The answer is (A).\n\n\
Q: Women's ability to have multiple orgasms is primarily due to:\n(A) the fact that\
\ they do not have a refractory period. (B) the response of the inner layers of\
\ the vagina. (C) having alternating orgasms in different locations. (D) the G-Spot.\n\
A: Let's think step by step. We refer to Wikipedia articles on human sexuality for\
\ help. The refractory period is the time when a person is not able to be erect\
\ or is not interested in sex. The answer is (A).\n\nQ: Morning sickness is typically\
\ a problem:\n(A) during the first trimester (B) during the second trimester (C)\
\ during the third trimester (D) all through the pregnancy\nA: Let's think step\
\ by step. We refer to Wikipedia articles on human sexuality for help. Morning sickness\
\ usually begins by nine weeks after conception, corresponding to the first trimester.\
\ The answer is (A)."
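The change running through every hunk in this patch, from block scalars with \uXXXX escapes on the minus side to fully quoted scalars with literal characters on the plus side, looks like the difference between PyYAML's default dump and a dump that forces double quoting and keeps unicode. A small sketch; the exact dump arguments are an inference from the output, not stated anywhere in the patch:

import yaml

doc = {"dataset_name": "human_sexuality",
       "description": "…a few-shot prefix with unicode…\n\nQ: example?"}

# Old style: default dump, non-ASCII escaped as \uXXXX, keys left bare.
print(yaml.dump(doc, default_flow_style=False))

# New style: every scalar double-quoted ("key": "value"), literal unicode kept,
# long lines folded with trailing backslashes as in the plus-side hunks.
print(yaml.dump(doc, default_style='"', allow_unicode=True))

Dumping with a fixed style like this also makes regenerated files diff cleanly, which may be why the whole directory was rewritten at once.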
+"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml index 655a39e8..31d87667 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml @@ -1,80 +1,54 @@ -dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international - law. - - - Q: How the consent to be bound of a State may be expressed? - - (A) The consent of a State to be bound is expressed only by ratification (B) The - consent of a state to be bound by a treaty may be expressed by signature, ratification, - acceptance, approval or accession (C) The consent of a State to be bound is expressed - by signature (D) The consent of a State to be bound is expressed by whatever means - they choose - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. Article 11 of Vienna Convention on the Law of Treaties signed in 1969 - states that "the consent of a State to be bound by a treaty may be expressed by - signature, exchange of instruments constituting a treaty, ratification, acceptance, - approval or accession, or by any other means if so agreed." (B) is the most precise - and accurate answer. The answer is (B). - - - Q: What is the judge ad hoc? - - (A) If a party to a contentious case before the ICJ does not have a national sitting - as judge, it is entitled to nominate someone as a judge solely for that case, with - the title of judge ad hoc (B) Judge ad hoc is the member of the bench of the ICJ - with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge is disqualified - or passes away (D) Judge ad hoc is the judge that each party will always nominate - in every contentious case - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. As "ad hoc" implies, a judge ad hoc is appointed only for a specific case - or period, when a party to a contentious case before the International Court of - Justice does not have a regular national sitting as judge. The answer is (A). - - - Q: When ''consent'' can serve as a circumstance precluding the wrongfulness of a - State conduct? - - (A) Consent can serve as a circumstance precluding the wrongfulness whenever it - is given (B) Consent can never serve as a circumstance precluding wrongfulness (C) - Consent can serve as a circumstance precluding wrongfulness, provided the consent - is valid and to the extent that the conduct remains within the limits of the consent - given (D) Consent can always serve as a circumstance precluding wrongfulness, no - matter which organ of the State gives it - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. Valid consent can serve as a circumstance precluding the wrongfulness - of a State conduct if the conduct remains within the limits of that consent, according - to Chapter V of the Responsibility of States for Internationally Wrongful Acts, - 2001, United Nations. The answer is (C). - - - Q: Would a reservation to the definition of torture in the ICCPR be acceptable in - contemporary practice? 
- - (A) This is an acceptable reservation if the reserving country''s legislation employs - a different definition (B) This is an unacceptable reservation because it contravenes - the object and purpose of the ICCPR (C) This is an unacceptable reservation because - the definition of torture in the ICCPR is consistent with customary international - law (D) This is an acceptable reservation because under general international law - States have the right to enter reservations to treaties - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. For it contravenes the object and purpose of the ICCPR, this is an unacceptable - reservation in contemporary practice. The answer is (B). - - - Q: What types of force does Article 2(4) of the UN Charter prohibit? - - (A) Article 2(4) encompasses only armed force (B) Article 2(4) encompasses all types - of force, including sanctions (C) Article 2(4) encompasses all interference in the - domestic affairs of States (D) Article 2(4) encompasses force directed only against - a State''s territorial integrity - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. Article 2(4) of the UN Charter prohibits states from using armed forces - in their international relations. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_international_law +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\nQ: How the consent to be bound of a State may be expressed?\n(A) The consent\ + \ of a State to be bound is expressed only by ratification (B) The consent of a\ + \ state to be bound by a treaty may be expressed by signature, ratification, acceptance,\ + \ approval or accession (C) The consent of a State to be bound is expressed by signature\ + \ (D) The consent of a State to be bound is expressed by whatever means they choose\n\ + A: Let's think step by step. We refer to Wikipedia articles on international law\ + \ for help. Article 11 of Vienna Convention on the Law of Treaties signed in 1969\ + \ states that \"the consent of a State to be bound by a treaty may be expressed\ + \ by signature, exchange of instruments constituting a treaty, ratification, acceptance,\ + \ approval or accession, or by any other means if so agreed.\" (B) is the most precise\ + \ and accurate answer. The answer is (B).\n\nQ: What is the judge ad hoc?\n(A) If\ + \ a party to a contentious case before the ICJ does not have a national sitting\ + \ as judge, it is entitled to nominate someone as a judge solely for that case,\ + \ with the title of judge ad hoc (B) Judge ad hoc is the member of the bench of\ + \ the ICJ with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge\ + \ is disqualified or passes away (D) Judge ad hoc is the judge that each party will\ + \ always nominate in every contentious case\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on international law for help. As \"ad hoc\" implies, a\ + \ judge ad hoc is appointed only for a specific case or period, when a party to\ + \ a contentious case before the International Court of Justice does not have a regular\ + \ national sitting as judge. 
The answer is (A).\n\nQ: When 'consent' can serve as\ + \ a circumstance precluding the wrongfulness of a State conduct?\n(A) Consent can\ + \ serve as a circumstance precluding the wrongfulness whenever it is given (B) Consent\ + \ can never serve as a circumstance precluding wrongfulness (C) Consent can serve\ + \ as a circumstance precluding wrongfulness, provided the consent is valid and to\ + \ the extent that the conduct remains within the limits of the consent given (D)\ + \ Consent can always serve as a circumstance precluding wrongfulness, no matter\ + \ which organ of the State gives it\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on international law for help. Valid consent can serve as a circumstance\ + \ precluding the wrongfulness of a State conduct if the conduct remains within the\ + \ limits of that consent, according to Chapter V of the Responsibility of States\ + \ for Internationally Wrongful Acts, 2001, United Nations. The answer is (C).\n\n\ + Q: Would a reservation to the definition of torture in the ICCPR be acceptable in\ + \ contemporary practice?\n(A) This is an acceptable reservation if the reserving\ + \ country's legislation employs a different definition (B) This is an unacceptable\ + \ reservation because it contravenes the object and purpose of the ICCPR (C) This\ + \ is an unacceptable reservation because the definition of torture in the ICCPR\ + \ is consistent with customary international law (D) This is an acceptable reservation\ + \ because under general international law States have the right to enter reservations\ + \ to treaties\nA: Let's think step by step. We refer to Wikipedia articles on international\ + \ law for help. For it contravenes the object and purpose of the ICCPR, this is\ + \ an unacceptable reservation in contemporary practice. The answer is (B).\n\nQ:\ + \ What types of force does Article 2(4) of the UN Charter prohibit?\n(A) Article\ + \ 2(4) encompasses only armed force (B) Article 2(4) encompasses all types of force,\ + \ including sanctions (C) Article 2(4) encompasses all interference in the domestic\ + \ affairs of States (D) Article 2(4) encompasses force directed only against a State's\ + \ territorial integrity\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on international law for help. Article 2(4) of the UN Charter prohibits states\ + \ from using armed forces in their international relations. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_international_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml index 7e11f0f7..fa354238 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml @@ -1,69 +1,45 @@ -dataset_name: jurisprudence -description: 'The following are multiple choice questions (with answers) about jurisprudence. - - - Q: Iverson Jewelers wrote a letter to Miller, ''We have received an exceptionally - fine self winding Rolox watch which we will sell to you at a very favorable price.'' - - (A) The letter is an offer to sell (B) A valid offer cannot be made by letter. (C) - The letter contains a valid offer which will terminate within a reasonable time. - (D) The letter lacks one of the essential elements of an offer. - - A: Let''s think step by step. 
We refer to Wikipedia articles on jurisprudence for - help. An offer shows the intent to enter into a mutually-beneficial contract with - specific terms. An offer can be made by a letter. While this letter indicates the - willingness to sell, the lack of specific terms, such as transaction price and offer - expiration date, makes it an incomplete offer. The answer is (D). - - - Q: Functions of the law include all but which of the following? - - (A) maximizing individual freedom (B) providing a basis for compromise (C) keeping - the peace (D) promoting the principles of the free enterprise system - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. Laws are fundamentally about helping resolve disputes between individuals, - and therefore essential for maximizing individual freedom, providing a basis for - compromise, and keeping the peace. The answer is (D). - - - Q: The ________ School of jurisprudence postulates that the law is based on what - is "correct." - - (A) Natural Law (B) Analytical (C) Historical (D) Sociological - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. Natural Law School of jurisprudence focuses on the laws of nature, and states - that the law should be based on ethics, morals, and what is "correct". Analytical - deals with the law as it already exists, Historical postulates that the law was - found and not made, and Sociological studies how the law and society impact each - other. The answer is (A). - - - Q: Which word best summarizes Weber''s explanation of the development of formally - rational law? - - (A) Authority. (B) Charisma. (C) Co-operation. (D) Capitalism. - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. Weber explained the development of formal rationality in laws as how the modern - society moved from tradition to rationality, where people decide actions based less - on how they were culturally done and more on expected utilities. How rational individuals - optimize efficiency of accomplishing tasks for higher rewards is a core principle - of Capitalism. The answer is (D). - - - Q: Which position does Rawls claim is the least likely to be adopted by the POP - (people in the original position)? - - (A) The POP would choose equality above liberty. (B) The POP would opt for the ''maximin'' - strategy. (C) The POP would opt for the ''difference principle''. (D) The POP would - reject the ''system of natural liberty.'' - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. The POP would opt for the ''maximin'' strategy, opt for the ''difference principle'', - and reject the ''system of natural liberty'', but the POP would not choose equality - above liberty, since the POP assume both equal and free citizens. The answer is - (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_jurisprudence +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \nQ: Iverson Jewelers wrote a letter to Miller, 'We have received an exceptionally\ + \ fine self winding Rolox watch which we will sell to you at a very favorable price.'\n\ + (A) The letter is an offer to sell (B) A valid offer cannot be made by letter. (C)\ + \ The letter contains a valid offer which will terminate within a reasonable time.\ + \ (D) The letter lacks one of the essential elements of an offer.\nA: Let's think\ + \ step by step. 
We refer to Wikipedia articles on jurisprudence for help. An offer\ + \ shows the intent to enter into a mutually-beneficial contract with specific terms.\ + \ An offer can be made by a letter. While this letter indicates the willingness\ + \ to sell, the lack of specific terms, such as transaction price and offer expiration\ + \ date, makes it an incomplete offer. The answer is (D).\n\nQ: Functions of the\ + \ law include all but which of the following?\n(A) maximizing individual freedom\ + \ (B) providing a basis for compromise (C) keeping the peace (D) promoting the principles\ + \ of the free enterprise system\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on jurisprudence for help. Laws are fundamentally about helping resolve\ + \ disputes between individuals, and therefore essential for maximizing individual\ + \ freedom, providing a basis for compromise, and keeping the peace. The answer is\ + \ (D).\n\nQ: The ________ School of jurisprudence postulates that the law is based\ + \ on what is \"correct.\"\n(A) Natural Law (B) Analytical (C) Historical (D) Sociological\n\ + A: Let's think step by step. We refer to Wikipedia articles on jurisprudence for\ + \ help. Natural Law School of jurisprudence focuses on the laws of nature, and states\ + \ that the law should be based on ethics, morals, and what is \"correct\". Analytical\ + \ deals with the law as it already exists, Historical postulates that the law was\ + \ found and not made, and Sociological studies how the law and society impact each\ + \ other. The answer is (A).\n\nQ: Which word best summarizes Weber's explanation\ + \ of the development of formally rational law?\n(A) Authority. (B) Charisma. (C)\ + \ Co-operation. (D) Capitalism.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on jurisprudence for help. Weber explained the development of formal\ + \ rationality in laws as how the modern society moved from tradition to rationality,\ + \ where people decide actions based less on how they were culturally done and more\ + \ on expected utilities. How rational individuals optimize efficiency of accomplishing\ + \ tasks for higher rewards is a core principle of Capitalism. The answer is (D).\n\ + \nQ: Which position does Rawls claim is the least likely to be adopted by the POP\ + \ (people in the original position)?\n(A) The POP would choose equality above liberty.\ + \ (B) The POP would opt for the 'maximin' strategy. (C) The POP would opt for the\ + \ 'difference principle'. (D) The POP would reject the 'system of natural liberty.'\n\ + A: Let's think step by step. We refer to Wikipedia articles on jurisprudence for\ + \ help. The POP would opt for the 'maximin' strategy, opt for the 'difference principle',\ + \ and reject the 'system of natural liberty', but the POP would not choose equality\ + \ above liberty, since the POP assume both equal and free citizens. The answer is\ + \ (A)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml index f6f3c359..c6251e67 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml @@ -1,71 +1,45 @@ -dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical - fallacies. 
- - - Q: When an arguer causes confusion during refutation because of real or feigned - lack of an ability to engage in refutation, that arguer may have committed the fallacy - of - - (A) poor sportsmanship (B) appeal to compassion (C) argument against the person - (D) ignorance of refutation - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. Ignorance of refutation, one of Aristotle''s original list of logical - fallacies in his Organon, is when someone causes confusion in an argument through - real or feigned inability to engage in refutation, in order to win the argument. - The answer is (D). - - - Q: The complex question fallacy consists of - - (A) arguing something is inferior just because it doesn''t do something it was never - intended to do. (B) including more than one claim in the proposition and treating - proof for one claim as proof for all the claims. (C) drawing a conclusion before - examining the evidence, and only considering evidence that supports that conclusion. - (D) asking a question that includes either an unproven assumption or more than one - question, thus making a straightforward yes or no answer meaningless. - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. The complex question fallacy is when someone makes a single yes or no - answer to a question meaningless, by including either an unproven assumption or - many questions. The latter is also known as the many questions fallacy. The answer - is (D). - - - Q: Arguing that what is true of the parts must be true of the whole is the fallacy - of... - - (A) Division (B) Composition (C) Appeal to the person (D) Appeal to ignorance - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. Fallacy of composition occurs when someone argues what is true of the - parts must be true of the whole. The answer is (B). - - - Q: Which of the following is true of a valid categorical syllogism? - - (A) The minor premise must deny the antecedent (B) The major premise must affirm - the consequent (C) The middle term must be used in at least one premise in a universal - or unqualified sense (D) All of the above - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. A valid categorical syllogism must satisfy several conditions: (1) the - syllogism must have exactly three terms (2) every term of the syllogism must be - used twice exactly, (3) a term may be used only once in any premise, and (4) the - middle term must be used in at least one premise in a universal or unqualified sense, - etc. Only (C) is true. The answer is (C). - - - Q: If someone attacks the character of an opposing arguer, instead of responding - to that opponent''s arguments, the first person has probably committed which of - the following fallacies? - - (A) tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. The argument against the person fallacy occurs when someone irrelevantly - attacks the character of an opposing arguer, instead of addressing that opponent''s - arguments. The answer is (C).' 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_logical_fallacies +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\nQ: When an arguer causes confusion during refutation because of\ + \ real or feigned lack of an ability to engage in refutation, that arguer may have\ + \ committed the fallacy of\n(A) poor sportsmanship (B) appeal to compassion (C)\ + \ argument against the person (D) ignorance of refutation\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on logical fallacies for help. Ignorance\ + \ of refutation, one of Aristotle's original list of logical fallacies in his Organon,\ + \ is when someone causes confusion in an argument through real or feigned inability\ + \ to engage in refutation, in order to win the argument. The answer is (D).\n\n\ + Q: The complex question fallacy consists of\n(A) arguing something is inferior just\ + \ because it doesn't do something it was never intended to do. (B) including more\ + \ than one claim in the proposition and treating proof for one claim as proof for\ + \ all the claims. (C) drawing a conclusion before examining the evidence, and only\ + \ considering evidence that supports that conclusion. (D) asking a question that\ + \ includes either an unproven assumption or more than one question, thus making\ + \ a straightforward yes or no answer meaningless.\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on logical fallacies for help. The complex question\ + \ fallacy is when someone makes a single yes or no answer to a question meaningless,\ + \ by including either an unproven assumption or many questions. The latter is also\ + \ known as the many questions fallacy. The answer is (D).\n\nQ: Arguing that what\ + \ is true of the parts must be true of the whole is the fallacy of...\n(A) Division\ + \ (B) Composition (C) Appeal to the person (D) Appeal to ignorance\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on logical fallacies for help. Fallacy\ + \ of composition occurs when someone argues what is true of the parts must be true\ + \ of the whole. The answer is (B).\n\nQ: Which of the following is true of a valid\ + \ categorical syllogism?\n(A) The minor premise must deny the antecedent (B) The\ + \ major premise must affirm the consequent (C) The middle term must be used in at\ + \ least one premise in a universal or unqualified sense (D) All of the above\nA:\ + \ Let's think step by step. We refer to Wikipedia articles on logical fallacies\ + \ for help. A valid categorical syllogism must satisfy several conditions: (1) the\ + \ syllogism must have exactly three terms (2) every term of the syllogism must be\ + \ used twice exactly, (3) a term may be used only once in any premise, and (4) the\ + \ middle term must be used in at least one premise in a universal or unqualified\ + \ sense, etc. Only (C) is true. The answer is (C).\n\nQ: If someone attacks the\ + \ character of an opposing arguer, instead of responding to that opponent's arguments,\ + \ the first person has probably committed which of the following fallacies?\n(A)\ + \ tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi\n\ + A: Let's think step by step. We refer to Wikipedia articles on logical fallacies\ + \ for help. 
The argument against the person fallacy occurs when someone irrelevantly\ + \ attacks the character of an opposing arguer, instead of addressing that opponent's\ + \ arguments. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml index 1856af53..3a99b908 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml @@ -1,5 +1,5 @@ -dataset_name: machine_learning -description: "The following are multiple choice questions (with answers) about machine\ +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine\ \ learning.\n\nQ: Which image data augmentation is most common for natural images?\n\ (A) random crop and horizontal flip (B) random crop and vertical flip (C) posterization\ \ (D) dithering\nA: Let's think step by step. Data augmentation is used to increase\ @@ -12,48 +12,49 @@ description: "The following are multiple choice questions (with answers) about m \ learning we consider a binary split according to whether the attribute is above\ \ or below some threshold. Pat suggests that instead we should just have a multiway\ \ split with one branch for each of the distinct values of the attribute. From the\ - \ list below choose the single biggest problem with Pat\u2019s suggestion:\n(A)\ - \ It is too computationally expensive. (B) It would probably result in a decision\ - \ tree that scores badly on the training set and a testset. (C) It would probably\ - \ result in a decision tree that scores well on the training set but badly on a\ - \ testset. (D) It would probably result in a decision tree that scores well on a\ - \ testset but badly on a training set.\nA: Let's think step by step. Because the\ - \ input is real valued, it is unlikely that the same values appear both at training\ - \ and test time. This means that while such a decision tree could yield good performance\ + \ list below choose the single biggest problem with Pat’s suggestion:\n(A) It is\ + \ too computationally expensive. (B) It would probably result in a decision tree\ + \ that scores badly on the training set and a testset. (C) It would probably result\ + \ in a decision tree that scores well on the training set but badly on a testset.\ + \ (D) It would probably result in a decision tree that scores well on a testset\ + \ but badly on a training set.\nA: Let's think step by step. Because the input is\ + \ real valued, it is unlikely that the same values appear both at training and test\ + \ time. This means that while such a decision tree could yield good performance\ \ on the training data, when evaluated on the test data it will perform badly because\ - \ the decision tree won\u2019t know what to do with numbers that did not appear\ - \ in the training data. The answer is (C).\n\nQ: You are reviewing papers for the\ - \ World\u2019s Fanciest Machine Learning Conference, and you see submissions with\ - \ the following claims. Which ones would you consider accepting?\n(A) My method\ - \ achieves a training error lower than all previous methods! (B) My method achieves\ - \ a test error lower than all previous methods! (Footnote: When regularisation parameter\ - \ \u03BB is chosen so as to minimise test error.) 
(C) My method achieves a test\ + \ the decision tree won’t know what to do with numbers that did not appear in the\ + \ training data. The answer is (C).\n\nQ: You are reviewing papers for the World’s\ + \ Fanciest Machine Learning Conference, and you see submissions with the following\ + \ claims. Which ones would you consider accepting?\n(A) My method achieves a training\ + \ error lower than all previous methods! (B) My method achieves a test error lower\ + \ than all previous methods! (Footnote: When regularisation parameter λ is chosen\ + \ so as to minimise test error.) (C) My method achieves a test error lower than\ + \ all previous methods! (Footnote: When regularisation parameter λ is chosen so\ + \ as to minimise cross-validaton error.) (D) My method achieves a cross-validation\ \ error lower than all previous methods! (Footnote: When regularisation parameter\ - \ \u03BB is chosen so as to minimise cross-validaton error.) (D) My method achieves\ - \ a cross-validation error lower than all previous methods! (Footnote: When regularisation\ - \ parameter \u03BB is chosen so as to minimise cross-validaton error.)\nA: Let's\ - \ think step by step. In machine learning, we train with some data and fixed hyperparameters\ - \ and the training error can be arbitrarily low, so (A) can\u2019t be right. Then,\ - \ one compares different hyperparameters by selecting the model with the lowest\ - \ cross-validation error, this means that (B) and (D) are not the right procedure.\ - \ The only relevant number after these is the test error and thus (C) is the right\ - \ answer. The answer is (C).\n\nQ: A 6-sided die is rolled 15 times and the results\ - \ are: side 1 comes up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times;\ - \ side 5: 4 times; side 6: 5 times. Based on these results, what is the probability\ - \ of side 3 coming up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16\ - \ (D) 1.0/5\nA: Let's think step by step. Add-1 smoothing adds the value of one\ - \ to the different counts and then normalizes the probabilities accordingly. The\ - \ counts after adding one will be: side 1 comes up 1 time; side 2: 2 times; side\ - \ 3: 3 times; side 4: 4 times; side 5: 5 times; side 6: 6 times. The number of sum\ - \ one die rolls will be 21, so the probability of drawing a three is 3/21 = 1/7.\ - \ The answer is (B).\n\nQ: To achieve an 0/1 loss estimate that is less than 1 percent\ - \ of the true 0/1 loss (with probability 95%), according to Hoeffding's inequality\ - \ the IID test set must have how many examples?\n(A) around 10 examples (B) around\ - \ 100 examples (C) between 100 and 500 examples (D) more than 1000 examples\nA:\ - \ Let's think step by step. By the Hoeffding\u2019s inequality, we expect that with\ - \ 95% probability the in-sample and out-of-sample errors differ by epsilon when\ - \ we have N samples if 2 exp(-2 epsilon^2 N)<0.05, this implies that N > -1/(2*epsilon**2)\ - \ log ( 0.05/2 )= log (40)*5000. Since log(40)>1, we have that one needs more than\ - \ 1000 examples. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_machine_learning + \ λ is chosen so as to minimise cross-validaton error.)\nA: Let's think step by\ + \ step. In machine learning, we train with some data and fixed hyperparameters and\ + \ the training error can be arbitrarily low, so (A) can’t be right. 
Then, one compares\ + \ different hyperparameters by selecting the model with the lowest cross-validation\ + \ error, this means that (B) and (D) are not the right procedure. The only relevant\ + \ number after these is the test error and thus (C) is the right answer. The answer\ + \ is (C).\n\nQ: A 6-sided die is rolled 15 times and the results are: side 1 comes\ + \ up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times; side 5: 4 times;\ + \ side 6: 5 times. Based on these results, what is the probability of side 3 coming\ + \ up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16 (D) 1.0/5\nA:\ + \ Let's think step by step. Add-1 smoothing adds the value of one to the different\ + \ counts and then normalizes the probabilities accordingly. The counts after adding\ + \ one will be: side 1 comes up 1 time; side 2: 2 times; side 3: 3 times; side 4:\ + \ 4 times; side 5: 5 times; side 6: 6 times. The number of sum one die rolls will\ + \ be 21, so the probability of drawing a three is 3/21 = 1/7. The answer is (B).\n\ + \nQ: To achieve an 0/1 loss estimate that is less than 1 percent of the true 0/1\ + \ loss (with probability 95%), according to Hoeffding's inequality the IID test\ + \ set must have how many examples?\n(A) around 10 examples (B) around 100 examples\ + \ (C) between 100 and 500 examples (D) more than 1000 examples\nA: Let's think step\ + \ by step. By the Hoeffding’s inequality, we expect that with 95% probability the\ + \ in-sample and out-of-sample errors differ by epsilon when we have N samples if\ + \ 2 exp(-2 epsilon^2 N)<0.05, this implies that N > -1/(2*epsilon**2) log ( 0.05/2\ + \ )= log (40)*5000. Since log(40)>1, we have that one needs more than 1000 examples.\ + \ The answer is (D)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml index db2f9642..1259e076 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml @@ -1,54 +1,33 @@ -dataset_name: management -description: 'The following are multiple choice questions (with answers) about management. - - - Q: How can organisational structures that are characterised by democratic and inclusive - styles of management be described? - - (A) Hierarchical (B) Bureaucratic (C) Flat (D) Functional - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - Flat organizational structures are characterized by democratic and inclusive styles - of management, and have few (if any) levels of management between the workers and - managers. The answer is (C). - - - Q: Hygiene factors are associated with which writer? - - (A) Frederick Hertzberg (B) D.C. McClelland (C) Abraham Maslow (D) Douglas McGregor - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - Hygiene factors include compensation, company policies, supervision, interpersonal - relations, and work environments. Hertzberg lists them as factors that cannot motivate - employees but can minimize job dissatisfaction. The answer is (A). - - - Q: What characteristic is not a key feature of the ''open systems'' model of management? - - (A) Morale (B) Innovation (C) Growth resource (D) Adaptation - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. 
- The key characteristics of an open system in management include innovation, growth - resource, and adaption, but do not include morale. The answer is (A). - - - Q: Which element of the cultural web forms regalia? - - (A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - The cultural web is a tool for mapping an organization''s culture, where symbols - form the regalia that visually expresses the values that the organization holds - as important. The answer is (A). - - - Q: What are the two main dimensions of the Ohio Studies into leadership? - - (A) Starting position and end position (B) Initial environment and changed environment - (C) Organisational structure and conditioning (D) Initiating structure and considerations - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - The Ohio State Leadership Studies conducted in the 1940s identified initiating structure - and consideration as the two main dimensions of leader behavior. The answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_management +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \nQ: How can organisational structures that are characterised by democratic and\ + \ inclusive styles of management be described?\n(A) Hierarchical (B) Bureaucratic\ + \ (C) Flat (D) Functional\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on management for help. Flat organizational structures are characterized by democratic\ + \ and inclusive styles of management, and have few (if any) levels of management\ + \ between the workers and managers. The answer is (C).\n\nQ: Hygiene factors are\ + \ associated with which writer?\n(A) Frederick Hertzberg (B) D.C. McClelland (C)\ + \ Abraham Maslow (D) Douglas McGregor\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on management for help. Hygiene factors include compensation,\ + \ company policies, supervision, interpersonal relations, and work environments.\ + \ Hertzberg lists them as factors that cannot motivate employees but can minimize\ + \ job dissatisfaction. The answer is (A).\n\nQ: What characteristic is not a key\ + \ feature of the 'open systems' model of management?\n(A) Morale (B) Innovation\ + \ (C) Growth resource (D) Adaptation\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on management for help. The key characteristics of an open system in\ + \ management include innovation, growth resource, and adaption, but do not include\ + \ morale. The answer is (A).\n\nQ: Which element of the cultural web forms regalia?\n\ + (A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems\n\ + A: Let's think step by step. We refer to Wikipedia articles on management for help.\ + \ The cultural web is a tool for mapping an organization's culture, where symbols\ + \ form the regalia that visually expresses the values that the organization holds\ + \ as important. The answer is (A).\n\nQ: What are the two main dimensions of the\ + \ Ohio Studies into leadership?\n(A) Starting position and end position (B) Initial\ + \ environment and changed environment (C) Organisational structure and conditioning\ + \ (D) Initiating structure and considerations\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on management for help. 
The Ohio State Leadership Studies\ + \ conducted in the 1940s identified initiating structure and consideration as the\ + \ two main dimensions of leader behavior. The answer is (D)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_management" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml index 5dd683da..d8a6b9b8 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml @@ -1,66 +1,40 @@ -dataset_name: marketing -description: 'The following are multiple choice questions (with answers) about marketing. - - - Q: Although the content and quality can be as controlled as direct mail, response - rates of this medium are lower because of the lack of a personal address mechanism. - This media format is known as: - - (A) Care lines. (B) Direct mail. (C) Inserts. (D) Door to door. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - Door to door marketing delivers non-addressed items within all buildings within - a geographic area. While it can control the content and quality as well as direct - mail marketing, its response rate is lower because of the lack of a personal address - mechanism. The answer is (D). - - - Q: In an organization, the group of people tasked with buying decisions is referred - to as the _______________. - - (A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making - unit. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - In an organization, the group of the people tasked with buying decision is referred - to as the decision-making unit. The answer is (D). - - - Q: The single group within society that is most vulnerable to reference group influence - is: - - (A) The older consumer who feels somewhat left out of things. (B) The married women, - many of whom feel a need for stability in their lives. (C) New immigrants who really - want to assimilate into their new culture. (D) Children, who base most of their - buying decisions on outside influences. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - Children, who mostly based their buying decisions on outside influences, are the - single group within society that is more vulnerable to reference group influence. - The answer is (D). - - - Q: Which of the following is an assumption in Maslow''s hierarchy of needs? - - (A) Needs are dependent on culture and also on social class. (B) Lower-level needs - must be at least partially satisfied before higher needs can affect behaviour. (C) - Needs are not prioritized or arranged in any particular order. (D) Satisfied needs - are motivators, and new needs emerge when current needs remain unmet. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - Maslow''s hierarchy of needs, from the bottom upwards, are physiological (food and - clothing), safety, love and belonging needs, esteem, and self-actualization. Lower-level - needs must be at least partially satisfied before higher ones can affect behavior. - The answer is (B). - - - Q: _____________ is a natural outcome when combining demographic and geographic - variables. - - (A) Geodemographics (B) Product differentiation. (C) ANSOFF matrix. (D) Brand management. - - A: Let''s think step by step. 
We refer to Wikipedia articles on marketing for help. - Geodemographics is a natural outcome when combining demographic and geographic variables. - The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_marketing +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \nQ: Although the content and quality can be as controlled as direct mail, response\ + \ rates of this medium are lower because of the lack of a personal address mechanism.\ + \ This media format is known as:\n(A) Care lines. (B) Direct mail. (C) Inserts.\ + \ (D) Door to door.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on marketing for help. Door to door marketing delivers non-addressed items within\ + \ all buildings within a geographic area. While it can control the content and quality\ + \ as well as direct mail marketing, its response rate is lower because of the lack\ + \ of a personal address mechanism. The answer is (D).\n\nQ: In an organization,\ + \ the group of people tasked with buying decisions is referred to as the _______________.\n\ + (A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making\ + \ unit.\nA: Let's think step by step. We refer to Wikipedia articles on marketing\ + \ for help. In an organization, the group of the people tasked with buying decision\ + \ is referred to as the decision-making unit. The answer is (D).\n\nQ: The single\ + \ group within society that is most vulnerable to reference group influence is:\n\ + (A) The older consumer who feels somewhat left out of things. (B) The married women,\ + \ many of whom feel a need for stability in their lives. (C) New immigrants who\ + \ really want to assimilate into their new culture. (D) Children, who base most\ + \ of their buying decisions on outside influences.\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on marketing for help. Children, who mostly based\ + \ their buying decisions on outside influences, are the single group within society\ + \ that is more vulnerable to reference group influence. The answer is (D).\n\nQ:\ + \ Which of the following is an assumption in Maslow's hierarchy of needs?\n(A) Needs\ + \ are dependent on culture and also on social class. (B) Lower-level needs must\ + \ be at least partially satisfied before higher needs can affect behaviour. (C)\ + \ Needs are not prioritized or arranged in any particular order. (D) Satisfied needs\ + \ are motivators, and new needs emerge when current needs remain unmet.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on marketing for help. Maslow's\ + \ hierarchy of needs, from the bottom upwards, are physiological (food and clothing),\ + \ safety, love and belonging needs, esteem, and self-actualization. Lower-level\ + \ needs must be at least partially satisfied before higher ones can affect behavior.\ + \ The answer is (B).\n\nQ: _____________ is a natural outcome when combining demographic\ + \ and geographic variables.\n(A) Geodemographics (B) Product differentiation. (C)\ + \ ANSOFF matrix. (D) Brand management.\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on marketing for help. Geodemographics is a natural outcome\ + \ when combining demographic and geographic variables. The answer is (A)." 
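Two of the chain-of-thought answers in the machine-learning few-shot earlier in this patch rest on small computations, the Add-1 smoothed die probability and the Hoeffding sample-size bound, and both can be checked numerically. A quick sketch using the same numbers as those answers:

from math import log

# Add-1 (Laplace) smoothing for side 3 after 15 rolls of a 6-sided die.
counts = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5}
p_side3 = (counts[3] + 1) / (sum(counts.values()) + len(counts))
print(p_side3)  # 3/21 = 1/7, matching answer (B)

# Hoeffding bound: solve 2 * exp(-2 * eps**2 * N) < 0.05 for N with eps = 0.01.
eps, delta = 0.01, 0.05
n_min = log(2 / delta) / (2 * eps**2)
print(n_min)  # ~18444, i.e. more than 1000 examples, matching answer (D)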
+"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_marketing" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml index ebf699aa..bf770592 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml @@ -1,61 +1,37 @@ -dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical - genetics. - - - Q: The stage of meiosis in which chromosomes pair and cross over is: - - (A) prophase I (B) metaphase I (C) prophase II (D) metaphase II - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. Prophase I is the stage of meiosis where homologous chromosomes pair with - each other and exchange genetic material. The answer is (A). - - - Q: DNA ligase is - - (A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial - origin which cuts DNA at defined base sequences (C) an enzyme that facilitates transcription - of specific genes (D) an enzyme which limits the level to which a particular nutrient - reaches - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. DNA ligase is a type of enzyme (EC 6.5.1.1) responsible for joining DNA - strands together by catalyzing a phosphodiester bond. The answer is (A). - - - Q: Which of the following conditions does not show multifactorial inheritance? - - (A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural tube defects) (D) - Marfan syndrome - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. Multifactorial inheritance is when more than a single factor is responsible - for causing a given trait or health problem. Genes cannot be the only factor. Marfan - syndrome, on the other hand, requires only one abnormal copy of the of the Marfan - gene, from one parent, to inherit the trait. The answer is (D). - - - Q: A gene showing codominance - - (A) has both alleles independently expressed in the heterozygote (B) has one allele - dominant to the other (C) has alleles tightly linked on the same chromosome (D) - has alleles expressed at the same time in development - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. Codominance, as it relates to genetics, refers to a type of genetic inheritance - where the phenotype of both the parents is easily observed in the offspring. A heterozygote - is an individual having two different alleles of a gene. The answer is (A). - - - Q: Large triplet repeat expansions can be detected by: - - (A) polymerase chain reaction. (B) single strand conformational polymorphism analysis. - (C) Southern blotting. (D) Western blotting. - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. A Southern blot is a method in molecular biology for detecting specific - DNA sequences in a sample. Large triplet repeat expansions are usually detected - with this method. The answer is (C).' 
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_medical_genetics
+"dataset_name": "medical_genetics"
+"description": "The following are multiple choice questions (with answers) about medical\
+ \ genetics.\n\nQ: The stage of meiosis in which chromosomes pair and cross over\
+ \ is:\n(A) prophase I (B) metaphase I (C) prophase II (D) metaphase II\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on medical genetics for help.\
+ \ Prophase I is the stage of meiosis where homologous chromosomes pair with each\
+ \ other and exchange genetic material. The answer is (A).\n\nQ: DNA ligase is\n\
+ (A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial\
+ \ origin which cuts DNA at defined base sequences (C) an enzyme that facilitates\
+ \ transcription of specific genes (D) an enzyme which limits the level to which\
+ \ a particular nutrient reaches\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on medical genetics for help. DNA ligase is a type of enzyme (EC 6.5.1.1)\
+ \ responsible for joining DNA strands together by catalyzing a phosphodiester bond.\
+ \ The answer is (A).\n\nQ: Which of the following conditions does not show multifactorial\
+ \ inheritance?\n(A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural\
+ \ tube defects) (D) Marfan syndrome\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on medical genetics for help. Multifactorial inheritance is when more\
+ \ than a single factor is responsible for causing a given trait or health problem.\
+ \ Genes cannot be the only factor. Marfan syndrome, on the other hand, requires\
+ \ only one abnormal copy of the Marfan gene, from one parent, to inherit\
+ \ the trait. The answer is (D).\n\nQ: A gene showing codominance\n(A) has both alleles\
+ \ independently expressed in the heterozygote (B) has one allele dominant to the\
+ \ other (C) has alleles tightly linked on the same chromosome (D) has alleles expressed\
+ \ at the same time in development\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on medical genetics for help. Codominance, as it relates to genetics,\
+ \ refers to a type of genetic inheritance where the phenotype of both the parents\
+ \ is easily observed in the offspring. A heterozygote is an individual having two\
+ \ different alleles of a gene. The answer is (A).\n\nQ: Large triplet repeat expansions\
+ \ can be detected by:\n(A) polymerase chain reaction. (B) single strand conformational\
+ \ polymorphism analysis. (C) Southern blotting. (D) Western blotting.\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on medical genetics for help.\
+ \ A Southern blot is a method in molecular biology for detecting specific DNA sequences\
+ \ in a sample. Large triplet repeat expansions are usually detected with this method.\
+ \ The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_medical_genetics"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml
index a506e940..0075bd64 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml
@@ -1,54 +1,27 @@
-dataset_name: miscellaneous
-description: 'The following are multiple choice questions (with answers) about miscellaneous.
- - - Q: Which of these songs was a Top 10 hit for the rock band The Police? - - (A) ''Radio Ga-Ga'' (B) ''Ob-la-di Ob-la-da'' (C) ''De Do Do Do De Da Da Da'' (D) - ''In-a-Gadda-Da-Vida'' - - A: Let''s think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by - Queen. Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly. - Leaving ''De Do Do Do De Da Da Da'' as the only song by The Police, and also a Top - 10 hit. The answer is (C). - - - Q: What place is named in the title of the 1979 live album by rock legends Cheap - Trick? - - (A) Budapest (B) Budokan (C) Bhutan (D) Britain - - A: Let''s think step by step. We refer to Wikipedia for help. Nippon Budokan is - an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including - Cheap Trick in 1978. ''Cheap Trick at Budokan'' became the name of their album. - The answer is (B). - - - Q: What is produced during photosynthesis? - - (A) hydrogen (B) nylon (C) oxygen (D) light - - A: Let''s think step by step. We refer to Wikipedia for help. Photosynthesis is - the process in which green plants use the green pigment chlorophyll to synthesize - foods with water and carbon dioxide. Oxygen is the byproduct of this process. The - answer is (C). - - - Q: Who is the shortest man to ever win an NBA slam dunk competition? - - (A) Anthony ''Spud'' Webb (B) Michael ''Air'' Jordan (C) Tyrone ''Muggsy'' Bogues - (D) Julius ''Dr J'' Erving - - A: Let''s think step by step. We refer to Wikipedia for help. In 1986, Spud Webb, - standing only 5''7" became the shortest NBA player in history to win an official - slam dunk contest. The answer is (A). - - - Q: How many axles does a standard automobile have? - - (A) one (B) two (C) four (D) eight - - A: Let''s think step by step. We refer to Wikipedia for help. Most cars have two - axles to rotate the wheels.. The answer is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_miscellaneous +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \nQ: Which of these songs was a Top 10 hit for the rock band The Police?\n(A) 'Radio\ + \ Ga-Ga' (B) 'Ob-la-di Ob-la-da' (C) 'De Do Do Do De Da Da Da' (D) 'In-a-Gadda-Da-Vida'\n\ + A: Let's think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by Queen.\ + \ Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly.\ + \ Leaving 'De Do Do Do De Da Da Da' as the only song by The Police, and also a Top\ + \ 10 hit. The answer is (C).\n\nQ: What place is named in the title of the 1979\ + \ live album by rock legends Cheap Trick?\n(A) Budapest (B) Budokan (C) Bhutan (D)\ + \ Britain\nA: Let's think step by step. We refer to Wikipedia for help. Nippon Budokan\ + \ is an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including\ + \ Cheap Trick in 1978. 'Cheap Trick at Budokan' became the name of their album.\ + \ The answer is (B).\n\nQ: What is produced during photosynthesis?\n(A) hydrogen\ + \ (B) nylon (C) oxygen (D) light\nA: Let's think step by step. We refer to Wikipedia\ + \ for help. Photosynthesis is the process in which green plants use the green pigment\ + \ chlorophyll to synthesize foods with water and carbon dioxide. Oxygen is the byproduct\ + \ of this process. 
The answer is (C).\n\nQ: Who is the shortest man to ever win\
+ \ an NBA slam dunk competition?\n(A) Anthony 'Spud' Webb (B) Michael 'Air' Jordan\
+ \ (C) Tyrone 'Muggsy' Bogues (D) Julius 'Dr J' Erving\nA: Let's think step by step.\
+ \ We refer to Wikipedia for help. In 1986, Spud Webb, standing only 5'7\" became\
+ \ the shortest NBA player in history to win an official slam dunk contest. The answer\
+ \ is (A).\n\nQ: How many axles does a standard automobile have?\n(A) one (B) two\
+ \ (C) four (D) eight\nA: Let's think step by step. We refer to Wikipedia for help.\
+ \ Most cars have two axles to rotate the wheels. The answer is (B)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_miscellaneous"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
index fdde3633..e2c30192 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
@@ -1,72 +1,49 @@
-dataset_name: moral_disputes
-description: 'The following are multiple choice questions (with answers) about moral
- disputes.
-
-
- Q: Baron admits that the versions of the ticking bomb hypothetical she discusses
- are "stunningly stupid," but she claims this is actually evidence of
-
- (A) the stupidity of most traditional philosophical examples. (B) a general lack
- of intelligence among people with advanced degrees. (C) the wrongness of torture.
- (D) the readiness on the part of many intelligent people to see torture as the best
- solution to deal with terrorism.
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
- help. The ticking bomb hypothetical poses a problem where many people will die to
- an exploding bomb, if the hypothetical terrorist does not disclose how to defuse
- it. Baron sees this hypothetical as silly, but its prevalence does suggest intelligent
- people, particularly utilitarians, see torture as justifiable to save the lives
- in this scenario. The answer is (D).
-
-
- Q: A fertilized ovum is also known as
-
- (A) a zygote. (B) an embryo. (C) a viability. (D) a blastocyst.
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
- help. Once a single sperm penetrates the layers of an egg to form a new cell, that
- cell is called a zygote. The answer is (A).
-
-
- Q: Pence compares six different cases of reproduction, from natural twinning to
- SCNT. What conclusion does he draw from this comparison?
-
- (A) SCNT is not a different kind of reproduction because there are no morally relevant
- differences between it and other permissible means of reproduction. (B) Because
- there is a low risk of harm for natural twinning, there will be a low risk of harm
- for SCNT. (C) both A and B (D) neither A nor B
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
- help. SCNT, known as ''somatic cell nuclear transfer'', is a technique in which
- the nucleus of a somatic (body) cell is transferred to the cytoplasm of an enucleated
- egg. This is used in farm animal cloning. Pence may compare this to other processes
- of reproduction, and see little moral separation. The answer is (A).
-
-
- Q: Which of the following is an intrinsic good, according to Singer?
- - (A) being in the upper class (B) the ability to learn and solve problems (C) being - more beautiful than most of the population (D) being shorter than average - - A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for - help. Bioethicist Peter Singer sees intrinsic value as innate values conferred by - oneself, for oneself. Innanimate objects can be beautiful, short, or have some valuable - criteria, but capabilities are intrinsically good. The answer is (B). - - - Q: According to Metz, what is wrong with consequentialist arguments against capital - punishment based on African values? - - (A) It is unclear as of yet whether or not capital punishment deters harm to the - community. (B) It is unclear as of yet whether or not capital punishment deters - harm to any individuals. (C) Consequentialism is not supported by African values. - (D) Even though consequentialism is supported by African values, no consequentialist - arguments framed in terms of African values have been offered. - - A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for - help. Thaddeus Metz is a humanities research professor focusing on ethical philosophy - in South Africa. Metz has written the death penalty is unjustified as it treats - individuals as incapable of communal relations. It is unclear that capital punishment - is to the benefit of, or a deterrent of harm to the community. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_moral_disputes +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\nQ: Baron admits that the versions of the ticking bomb hypothetical\ + \ she discusses are \"stunningly stupid,\" but she claims this is actually evidence\ + \ of\n(A) the stupidity of most traditional philosophical examples. (B) a general\ + \ lack of intelligence among people with advanced degrees. (C) the wrongness of\ + \ torture. (D) the readiness on the part of many intelligent people to see torture\ + \ as the best solution to deal with terrorism.\nA: Let's think step by step. We\ + \ refer to Wikipedia articles on moral disputes for help. The ticking bomb hypothetical\ + \ poses a problem where many people will die to an exploding bomb, if the hypothetical\ + \ terrorist does not disclose how to defuse it. Baron sees this hypothetical as\ + \ silly, but its prevalence does suggest intelligent people, particularly utilitarians,\ + \ see torture as justifiable to save the lives in this scenario. The answer is (D).\n\ + \nQ: A fertilized ovum is also known as\n(A) a zygote. (B) an embryo. (C) a viability.\ + \ (D) a blastocyst.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on moral disputes for help. Once a single sperm penetrates the layers of an egg\ + \ to form a new cell, that cell is called a zygote. The answer is (A).\n\nQ: Pence\ + \ compares six different cases of reproduction, from natural twinning to SCNT. What\ + \ conclusion does he draw from this comparison?\n(A) SCNT is not a different kind\ + \ of reproduction because there are no morally relevant differences between it and\ + \ other permissible means of reproduction. (B) Because there is a low risk of harm\ + \ for natural twinning, there will be a low risk of harm for SCNT. (C) both A and\ + \ B (D) neither A nor B\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on moral disputes for help. 
SCNT, known as 'somatic cell nuclear transfer', is\
+ \ a technique in which the nucleus of a somatic (body) cell is transferred to the\
+ \ cytoplasm of an enucleated egg. This is used in farm animal cloning. Pence may\
+ \ compare this to other processes of reproduction, and see little moral separation.\
+ \ The answer is (A).\n\nQ: Which of the following is an intrinsic good, according\
+ \ to Singer?\n(A) being in the upper class (B) the ability to learn and solve problems\
+ \ (C) being more beautiful than most of the population (D) being shorter than average\n\
+ A: Let's think step by step. We refer to Wikipedia articles on moral disputes for\
+ \ help. Bioethicist Peter Singer sees intrinsic value as innate values conferred\
+ \ by oneself, for oneself. Inanimate objects can be beautiful, short, or have some\
+ \ valuable criteria, but capabilities are intrinsically good. The answer is (B).\n\
+ \nQ: According to Metz, what is wrong with consequentialist arguments against capital\
+ \ punishment based on African values?\n(A) It is unclear as of yet whether or not\
+ \ capital punishment deters harm to the community. (B) It is unclear as of yet whether\
+ \ or not capital punishment deters harm to any individuals. (C) Consequentialism\
+ \ is not supported by African values. (D) Even though consequentialism is supported\
+ \ by African values, no consequentialist arguments framed in terms of African values\
+ \ have been offered.\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on moral disputes for help. Thaddeus Metz is a humanities research professor focusing\
+ \ on ethical philosophy in South Africa. Metz has written the death penalty is unjustified\
+ \ as it treats individuals as incapable of communal relations. It is unclear that\
+ \ capital punishment is to the benefit of, or a deterrent of harm to the community.\
+ \ The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_moral_disputes"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
index 4c82f5d5..cb2599e2 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
@@ -1,66 +1,44 @@
-dataset_name: moral_scenarios
-description: 'The following are multiple choice questions (with answers) about moral
- scenarios.
-
-
- Q: For which of these two scenarios does the main character (who uses I/me/my) do
- something clearly morally wrong, according to ordinary moral standards in the US
- as of 2020? Scenario 1 | I took his wallet after he harassed me at the bar. Scenario
- 2 | I made love to my newborn baby boy.
-
- (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
- for help. Stealing from someone is not considered an ethical response to harrassment.
- And having sex with a minor is egregiously unethical. The answer is (A).
-
-
- Q: For which of these two scenarios does the main character (who uses I/me/my) do
- something clearly morally wrong, according to ordinary moral standards in the US
- as of 2020? Scenario 1 | I asked for a cigarette from the guy outside the store.
- Scenario 2 | When the security guard left his post, I pocketed a rare coin.
-
- (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
- for help. Asking for something is not morally wrong. Stealing is morally wrong.
- The answer is (C).
-
-
- Q: For which of these two scenarios does the main character (who uses I/me/my) do
- something clearly morally wrong, according to ordinary moral standards in the US
- as of 2020? Scenario 1 | During the house''s demolition, I pushed him into a wall.
- Scenario 2 | When I choose a babysitter for my twin nieces, I don''t care about
- her racial background.
-
- (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
- for help. Physical violence is morally wrong. Being open to hiring people from different
- racial backgrounds is an expected moral stance. The answer is (B).
-
-
- Q: For which of these two scenarios does the main character (who uses I/me/my) do
- something clearly morally wrong, according to ordinary moral standards in the US
- as of 2020? Scenario 1 | I bought an old pick-up truck for work. Scenario 2 | I
- looked up at the gorgeous waterfall and threw a rock at it.
-
- (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
- for help. Neither of these are wrong as no one is wronged by buing a pick-up truck
- or throwing a rock into a stream. The answer is (D).
-
-
- Q: For which of these two scenarios does the main character (who uses I/me/my) do
- something clearly morally wrong, according to ordinary moral standards in the US
- as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new
- girl is embarrassed to live in a trailer park, so I invite the class to her house.
-
- (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios
- for help. Loving someone is not wrong. However, exposing something that someone
- is embarrassed about could be considered quite mean. The answer is (C).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_moral_scenarios
+"dataset_name": "moral_scenarios"
+"description": "The following are multiple choice questions (with answers) about moral\
+ \ scenarios.\n\nQ: For which of these two scenarios does the main character (who\
+ \ uses I/me/my) do something clearly morally wrong, according to ordinary moral\
+ \ standards in the US as of 2020? Scenario 1 | I took his wallet after he harassed\
+ \ me at the bar. Scenario 2 | I made love to my newborn baby boy.\n(A) Wrong, Wrong\
+ \ (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on moral scenarios for help. Stealing\
+ \ from someone is not considered an ethical response to harassment. And having\
+ \ sex with a minor is egregiously unethical. The answer is (A).\n\nQ: For which\
+ \ of these two scenarios does the main character (who uses I/me/my) do something\
+ \ clearly morally wrong, according to ordinary moral standards in the US as of 2020?\
+ \ Scenario 1 | I asked for a cigarette from the guy outside the store. Scenario\
+ \ 2 | When the security guard left his post, I pocketed a rare coin.\n(A) Wrong,\
+ \ Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on moral scenarios for help.\
+ \ Asking for something is not morally wrong. Stealing is morally wrong. The answer\
+ \ is (C).\n\nQ: For which of these two scenarios does the main character (who uses\
+ \ I/me/my) do something clearly morally wrong, according to ordinary moral standards\
+ \ in the US as of 2020? Scenario 1 | During the house's demolition, I pushed him\
+ \ into a wall. Scenario 2 | When I choose a babysitter for my twin nieces, I don't\
+ \ care about her racial background.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not\
+ \ wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to\
+ \ Wikipedia articles on moral scenarios for help. Physical violence is morally wrong.\
+ \ Being open to hiring people from different racial backgrounds is an expected moral\
+ \ stance. The answer is (B).\n\nQ: For which of these two scenarios does the main\
+ \ character (who uses I/me/my) do something clearly morally wrong, according to\
+ \ ordinary moral standards in the US as of 2020? Scenario 1 | I bought an old pick-up\
+ \ truck for work. Scenario 2 | I looked up at the gorgeous waterfall and threw a\
+ \ rock at it.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not\
+ \ wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on moral scenarios for help. Neither of these are wrong as no one is wronged by\
+ \ buying a pick-up truck or throwing a rock into a stream. The answer is (D).\n\n\
+ Q: For which of these two scenarios does the main character (who uses I/me/my) do\
+ \ something clearly morally wrong, according to ordinary moral standards in the\
+ \ US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The\
+ \ new girl is embarrassed to live in a trailer park, so I invite the class to her\
+ \ house.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong,\
+ \ Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral\
+ \ scenarios for help. Loving someone is not wrong. However, exposing something that\
+ \ someone is embarrassed about could be considered quite mean. The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_moral_scenarios"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml
index eae79250..8af8d29a 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml
@@ -1,72 +1,48 @@
-dataset_name: nutrition
-description: 'The following are multiple choice questions (with answers) about nutrition.
-
-
- Q: What is the first-line drug for patients with type 2 diabetes and obesity, as
- of 2020?
-
- (A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin
-
- A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help.
- Metformin (Fortamet, Glumetza, or others) is usually the first medication prescribed
- for type 2 diabetes, as well as obesity. It works by lowering glucose production
- in the liver and improving the body''s sensitivity to insulin. The answer is (B).
- - - Q: Which of the following statements is correct (according to knowledge in 2020)? - - (A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame - (B) Consumers with phenylketonuria must avoid the consumption of the sweetener saccharin - (C) Consumers with phenylketonuria must avoid the consumption of the sweetener sucralose - (D) Consumers with phenylketonuria must avoid the consumption of the sweetener acesulfame - K - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - People with phenylketonuria (PKU) cannot break down the amino acid phenylalanine. - As it builds up in the blood and brain it can lead to brain damage. People with - PKU should avoid foods that are converted to phenylalanine in the body, such as - aspartame. The answer is (A). - - - Q: Which of the following statements about iodine is correct, as of 2020? - - (A) 50% of adults consume iodine at levels below the RNI (B) Dairy products are - a poor source of iodine (C) The iodine content of organic milk is generally lower - that the level in non-organic milk (D) UK dietary reference values recommend an - increase in iodine intake in pregnancy - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - Organic milk usually has less iodine content than non-organic milk. The answer is - (C). - - - Q: Which of the following is the most plausible explanation for the protective effect - of dietary fibre against cancer of the colon, as of 2020? - - (A) Propionic acid, formed during colonic fibre fermentation inhibits liver fatty - acid synthesis (B) Butyric acid, formed during colonic fibre fermentation stimulates - "silencing" of the SLC5A8 tumour suppressor gene (C) None of these options are correct - (D) Butyric acid, formed during colonic fibre fermentation stimulates anti-oxidant - defences in the colon - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - Dietary fibre is inversely proportional to the risk of colorectal cancer. This is - presumed because butyric acid (BA) stimulates antioxidants which help protect the - colon from cancerous tumors. The answer is (D). - - - Q: In a cohort study, the risk ratio of developing diabetes was 0.86 when comparing - consumers of tea (the exposed) to those who did not drink tea (the unexposed). Which - one statement is correct (according to knowledge in 2020)? - - (A) The tea drinkers have lower risk of developing diabetes. (B) The tea drinkers - have higher risk of developing diabetes. (C) Based on the information given we cannot - tell if the observed difference in disease risk is the result of chance. (D) The - risk ratio is close to the value one, so there is no difference in disease risk - between the two groups. - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - The risk ratio is not sufficiently reduced that it could not be explained by random - chance given the studies sample size. The answer is (C).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_nutrition +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \nQ: What is the first-line drug for patients with type 2 diabetes and obesity,\ + \ as of 2020?\n(A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on nutrition for help. 
Metformin\
+ \ (Fortamet, Glumetza, or others) is usually the first medication prescribed for\
+ \ type 2 diabetes, as well as obesity. It works by lowering glucose production in\
+ \ the liver and improving the body's sensitivity to insulin. The answer is (B).\n\
+ \nQ: Which of the following statements is correct (according to knowledge in 2020)?\n\
+ (A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame\
+ \ (B) Consumers with phenylketonuria must avoid the consumption of the sweetener\
+ \ saccharin (C) Consumers with phenylketonuria must avoid the consumption of the\
+ \ sweetener sucralose (D) Consumers with phenylketonuria must avoid the consumption\
+ \ of the sweetener acesulfame K\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on nutrition for help. People with phenylketonuria (PKU) cannot break\
+ \ down the amino acid phenylalanine. As it builds up in the blood and brain it can\
+ \ lead to brain damage. People with PKU should avoid foods that are converted to\
+ \ phenylalanine in the body, such as aspartame. The answer is (A).\n\nQ: Which of\
+ \ the following statements about iodine is correct, as of 2020?\n(A) 50% of adults\
+ \ consume iodine at levels below the RNI (B) Dairy products are a poor source of\
+ \ iodine (C) The iodine content of organic milk is generally lower than the level\
+ \ in non-organic milk (D) UK dietary reference values recommend an increase in iodine\
+ \ intake in pregnancy\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on nutrition for help. Organic milk usually has less iodine content than non-organic\
+ \ milk. The answer is (C).\n\nQ: Which of the following is the most plausible explanation\
+ \ for the protective effect of dietary fibre against cancer of the colon, as of\
+ \ 2020?\n(A) Propionic acid, formed during colonic fibre fermentation inhibits liver\
+ \ fatty acid synthesis (B) Butyric acid, formed during colonic fibre fermentation\
+ \ stimulates \"silencing\" of the SLC5A8 tumour suppressor gene (C) None of these\
+ \ options are correct (D) Butyric acid, formed during colonic fibre fermentation\
+ \ stimulates anti-oxidant defences in the colon\nA: Let's think step by step. We\
+ \ refer to Wikipedia articles on nutrition for help. Dietary fibre is inversely\
+ \ proportional to the risk of colorectal cancer. This is presumed because butyric\
+ \ acid (BA) stimulates antioxidants which help protect the colon from cancerous\
+ \ tumors. The answer is (D).\n\nQ: In a cohort study, the risk ratio of developing\
+ \ diabetes was 0.86 when comparing consumers of tea (the exposed) to those who did\
+ \ not drink tea (the unexposed). Which one statement is correct (according to knowledge\
+ \ in 2020)?\n(A) The tea drinkers have lower risk of developing diabetes. (B) The\
+ \ tea drinkers have higher risk of developing diabetes. (C) Based on the information\
+ \ given we cannot tell if the observed difference in disease risk is the result\
+ \ of chance. (D) The risk ratio is close to the value one, so there is no difference\
+ \ in disease risk between the two groups.\nA: Let's think step by step. We refer\
+ \ to Wikipedia articles on nutrition for help. The risk ratio is not sufficiently\
+ \ reduced that it could not be explained by random chance given the study's sample\
+ \ size. The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml index 60ce6c54..5f52bc0c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml @@ -1,30 +1,30 @@ -dataset_name: philosophy -description: "The following are multiple choice questions (with answers) about philosophy.\n\ +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ \nQ: The study of reality in the broadest sense, an inquiry into the elemental nature\ \ of the universe and the things in it, is known as _____.\n(A) metaphysics (B)\ \ epistemology (C) quantum physics (D) axiology\nA: Let's think step by step. We\ \ refer to Wikipedia articles on philosophy for help. Among the options, only metaphysics\ \ studies the nature of reality and existence. The answer is (A).\n\nQ: According\ - \ to Moore\u2019s \u201Cideal utilitarianism,\u201D the right action is the one\ - \ that brings about the greatest amount of:\n(A) pleasure. (B) happiness. (C) good.\ - \ (D) virtue.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy\ - \ for help. Moore's \"ideal utilitarianism\" states that one's actions should maximize\ - \ intrinsic goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion,\ - \ what was his perspective on the meaning of life?\n(A) optimist (B) satisfied (C)\ - \ nominally religious (D) pessimist\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on philosophy for help. Before his conversion, Tolstoy feels that life\ - \ was uncertain, which is a pessimist's point of view. The answer is (D).\n\nQ:\ - \ According to d'Holbach, people always act according to _____.\n(A) free choices\ - \ (B) dictates of the soul (C) necessary natural laws (D) undetermined will\nA:\ - \ Let's think step by step. We refer to Wikipedia articles on philosophy for help.\ - \ d'Holbach believes that people act according to necessary laws, and it proves\ - \ nothing about people's free will. The answer is (C).\n\nQ: Psychological egoism\ - \ is:\n(A) an ethical theory about how we ought to behave. (B) a generalization\ - \ concerning the way people tend to behave. (C) a claim about human nature and the\ - \ ways people are capable of behaving. (D) none of the above.\nA: Let's think step\ - \ by step. We refer to Wikipedia articles on philosophy for help. Psychological\ - \ egoism suggests that one behaves based on what makes one feels good, hence it\ - \ is a claim about human nature and how humans are capable of behaving. The answer\ - \ is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_philosophy + \ to Moore’s “ideal utilitarianism,” the right action is the one that brings about\ + \ the greatest amount of:\n(A) pleasure. (B) happiness. (C) good. (D) virtue.\n\ + A: Let's think step by step. We refer to Wikipedia articles on philosophy for help.\ + \ Moore's \"ideal utilitarianism\" states that one's actions should maximize intrinsic\ + \ goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion, what was\ + \ his perspective on the meaning of life?\n(A) optimist (B) satisfied (C) nominally\ + \ religious (D) pessimist\nA: Let's think step by step. 
We refer to Wikipedia articles\
+ \ on philosophy for help. Before his conversion, Tolstoy feels that life was uncertain,\
+ \ which is a pessimist's point of view. The answer is (D).\n\nQ: According to d'Holbach,\
+ \ people always act according to _____.\n(A) free choices (B) dictates of the soul\
+ \ (C) necessary natural laws (D) undetermined will\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on philosophy for help. d'Holbach believes that\
+ \ people act according to necessary laws, and it proves nothing about people's free\
+ \ will. The answer is (C).\n\nQ: Psychological egoism is:\n(A) an ethical theory\
+ \ about how we ought to behave. (B) a generalization concerning the way people tend\
+ \ to behave. (C) a claim about human nature and the ways people are capable of behaving.\
+ \ (D) none of the above.\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on philosophy for help. Psychological egoism suggests that one behaves based on\
+ \ what makes one feel good, hence it is a claim about human nature and how humans\
+ \ are capable of behaving. The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_philosophy"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml
index e1c8dcc6..dc350126 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml
@@ -1,67 +1,42 @@
-dataset_name: prehistory
-description: 'The following are multiple choice questions (with answers) about prehistory.
-
-
- Q: What is the approximate mean cranial capacity of Homo erectus?
-
- (A) under 650 cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc
-
- A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help.
- The average cranium capacity of Homo erectus is less than 1000 cubic cm. The answer
- is (C).
-
-
- Q: According to Timothy Pauketat, the evidence for social stratification and political
- power at Cahokia suggests:
-
- (A) a center of Mississippian civilization with conditions similar to the rise of
- early states. (B) the limitations of authority in a Native American society of egalitarian
- foragers. (C) a simple chiefdom or perhaps a complex chiefdom had evolved by A.D.
- 1500. (D) a center of Mississippian civilization with conditions similar to societies
- on the Northwest Coast of North America.
-
- A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help.
- Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian
- culture, where he found similar conditions to the rise of early states. The answer
- is (A).
-
-
- Q: Recent research on hominid species dating from the Middle Pliocene indicates
- there was (as of 2020):
-
- (A) a great amount of species diversity, or a single species that exhibited a lot
- of diversity. (B) very little species diversity during this period and very few
- hominids. (C) decreased species diversity due to a prolonged ice age followed by
- a severe drought. (D) decreased species diversity but increased numbers of hammerstones
- and flakes, indicating stone tool manufacture.
-
- A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help.
- Recent research has recognized multiple hominid species from the Middle Pliocene, - meaning that there is a great amount of species diversity or diversity in a single - species. The answer is (A). - - - Q: Researchers now believe that the decline of the Maya was caused chiefly by: - - (A) a cataclysm of some kind, such as an earthquake, volcano, or tsunami. (B) ecological - degradation resulting from slash-and-burn farming techniques. (C) endless wars between - neighboring Mayan city-states. (D) practices of interbreeding that led to a steep - rise in congenital disorders. - - A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. - Researchers believe that the Maya collapse was mainly caused by over-exploitation - of natural resources like the slash-and-burn farming techniques. The answer is (B). - - - Q: The great Mayan king Pacal built temples in the city of Palenque in order to: - - (A) satisfy the powerful Mayan astronomer priests. (B) display his generosity to - the common people, since they were allowed to live in the temples. (C) frighten - away enemies, in particular the Spaniards. (D) legitimize his kingship, since his - father was not royal. - - A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. - Pacal built the temples as the funerary monument to legitimize his kingship. The - answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_prehistory +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \nQ: What is the approximate mean cranial capacity of Homo erectus?\n(A) under 650\ + \ cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on prehistory for help. The average cranium\ + \ capacity of Homo erectus is less than 1000 cubic cm. The answer is (C).\n\nQ:\ + \ According to Timothy Pauketat, the evidence for social stratification and political\ + \ power at Cahokia suggests:\n(A) a center of Mississippian civilization with conditions\ + \ similar to the rise of early states. (B) the limitations of authority in a Native\ + \ American society of egalitarian foragers. (C) a simple chiefdom or perhaps a complex\ + \ chiefdom had evolved by A.D. 1500. (D) a center of Mississippian civilization\ + \ with conditions similar to societies on the Northwest Coast of North America.\n\ + A: Let's think step by step. We refer to Wikipedia articles on prehistory for help.\ + \ Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian\ + \ culture, where he found similar conditions to the rise of early states. The answer\ + \ is (A).\n\nQ: Recent research on hominid species dating from the Middle Pliocene\ + \ indicates there was (as of 2020):\n(A) a great amount of species diversity, or\ + \ a single species that exhibited a lot of diversity. (B) very little species diversity\ + \ during this period and very few hominids. (C) decreased species diversity due\ + \ to a prolonged ice age followed by a severe drought. (D) decreased species diversity\ + \ but increased numbers of hammerstones and flakes, indicating stone tool manufacture.\n\ + A: Let's think step by step. We refer to Wikipedia articles on prehistory for help.\ + \ Recent research has recognized multiple hominid species from the Middle Pliocene,\ + \ meaning that there is a great amount of species diversity or diversity in a single\ + \ species. 
The answer is (A).\n\nQ: Researchers now believe that the decline of\ + \ the Maya was caused chiefly by:\n(A) a cataclysm of some kind, such as an earthquake,\ + \ volcano, or tsunami. (B) ecological degradation resulting from slash-and-burn\ + \ farming techniques. (C) endless wars between neighboring Mayan city-states. (D)\ + \ practices of interbreeding that led to a steep rise in congenital disorders.\n\ + A: Let's think step by step. We refer to Wikipedia articles on prehistory for help.\ + \ Researchers believe that the Maya collapse was mainly caused by over-exploitation\ + \ of natural resources like the slash-and-burn farming techniques. The answer is\ + \ (B).\n\nQ: The great Mayan king Pacal built temples in the city of Palenque in\ + \ order to:\n(A) satisfy the powerful Mayan astronomer priests. (B) display his\ + \ generosity to the common people, since they were allowed to live in the temples.\ + \ (C) frighten away enemies, in particular the Spaniards. (D) legitimize his kingship,\ + \ since his father was not royal.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on prehistory for help. Pacal built the temples as the funerary monument\ + \ to legitimize his kingship. The answer is (D)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml index c4957a1f..57538d21 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml @@ -1,7 +1,7 @@ -dataset_name: professional_accounting -description: "The following are multiple choice questions (with answers) about professional\ - \ accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer\u2019\ - s subledger. Which of the following management assertions is supported by this test?\n\ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer’s\ + \ subledger. Which of the following management assertions is supported by this test?\n\ (A) Valuation and allocation (B) Completeness (C) Rights and obligations (D) Presentation\ \ and disclosure\nA: Let's think step by step. We refer to Wikipedia articles on\ \ accounting for help. The completeness assertion is tested by tracing supporting\ @@ -43,5 +43,6 @@ description: "The following are multiple choice questions (with answers) about p \ transactions, only Proceeds from long-term debt belongs to the financing activities\ \ section of cashflow, hence the amount reported should be $100000. The answer is\ \ (D)." 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_accounting +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml index f3a957db..82d7a960 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml @@ -1,5 +1,5 @@ -dataset_name: professional_law -description: "The following are multiple choice questions (with answers) about professional\ +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional\ \ law.\n\nQ: A son owed a creditor $5,000. The son's father contacted the creditor\ \ and told him that he wanted to pay the son's debt. The father signed a document\ \ that stated the father would pay the son's debt at a rate of $500 a month for\ @@ -101,5 +101,6 @@ description: "The following are multiple choice questions (with answers) about p \ Amendment further supports the First Amendment by establishing a due process clause.\ \ Hence the strongest argument should be the statute is overbroad and consequently\ \ invalid under the First and Fourteenth Amendments. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_law +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml index 89b890f8..2263ae45 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml @@ -1,18 +1,18 @@ -dataset_name: professional_medicine -description: "The following are multiple choice questions (with answers) about professional\ +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional\ \ medicine.\n\nQ: A 22-year-old male marathon runner presents to the office with\ \ the complaint of right-sided rib pain when he runs long distances. Physical examination\ - \ reveals normal heart and lung findings and an exhalation dysfunction at ribs\_\ - 4-5 on the right. Which of the following muscles or muscle groups will be most useful\ + \ reveals normal heart and lung findings and an exhalation dysfunction at ribs 4-5\ + \ on the right. Which of the following muscles or muscle groups will be most useful\ \ in correcting this dysfunction utilizing a direct method?\n(A) anterior scalene\ \ (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum\nA: Let's think\ \ step by step. We refer to Wikipedia articles on medicine for help. Among the options,\ \ only pectoralis minor muscle origins from the outer surfaces of the 3rd to 5th\ \ ribs. The answer is (C).\n\nQ: A 36-year-old male presents to the office with\ - \ a\_3-week\_history of low back pain. He denies any recent trauma but says that\ - \ he climbs in and out of his truck numerous times a day for his job. 
Examination\ - \ of the patient in the prone position reveals a deep sacral sulcus on the left,\ - \ a posterior inferior lateral angle on the right, and a lumbosacral junction that\ + \ a 3-week history of low back pain. He denies any recent trauma but says that he\ + \ climbs in and out of his truck numerous times a day for his job. Examination of\ + \ the patient in the prone position reveals a deep sacral sulcus on the left, a\ + \ posterior inferior lateral angle on the right, and a lumbosacral junction that\ \ springs freely on compression. The most likely diagnosis is\n(A) left-on-left\ \ sacral torsion (B) left-on-right sacral torsion (C) right unilateral sacral flexion\ \ (D) right-on-right sacral torsion\nA: Let's think step by step. We refer to Wikipedia\ @@ -23,9 +23,9 @@ description: "The following are multiple choice questions (with answers) about p \ nonproductive cough, runny nose, and frontal headache. He says the headache is\ \ worse in the morning and ibuprofen does provide some relief. He has not had shortness\ \ of breath. Medical history is unremarkable. He takes no medications other than\ - \ the ibuprofen for pain. Vital signs are temperature 37.4\xB0C (99.4\xB0F), pulse\ - \ 88/min, respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the\ - \ nares shows erythematous mucous membranes. Examination of the throat shows erythema\ + \ the ibuprofen for pain. Vital signs are temperature 37.4°C (99.4°F), pulse 88/min,\ + \ respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the nares\ + \ shows erythematous mucous membranes. Examination of the throat shows erythema\ \ and follicular lymphoid hyperplasia on the posterior oropharynx. There is no palpable\ \ cervical adenopathy. Lungs are clear to auscultation. Which of the following is\ \ the most likely cause of this patient's symptoms?\n(A) Allergic rhinitis (B) Epstein-Barr\ @@ -57,13 +57,14 @@ description: "The following are multiple choice questions (with answers) about p \ A follow-up visit in the office 2 weeks ago disclosed elevated urinary normetanephrine\ \ and metanephrine and plasma aldosterone concentrations. The patient was referred\ \ to a surgeon, who recommended the adrenalectomy. Today, vital signs are temperature\ - \ 36.6\xB0C (97.9\xB0F), pulse 100/min, respirations 14/min, and blood pressure\ - \ 170/95 mm Hg. Physical examination discloses no significant findings. Initial\ - \ preoperative preparation should include treatment with which of the following?\n\ - (A) Labetalol (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\n\ + \ 36.6°C (97.9°F), pulse 100/min, respirations 14/min, and blood pressure 170/95\ + \ mm Hg. Physical examination discloses no significant findings. Initial preoperative\ + \ preparation should include treatment with which of the following?\n(A) Labetalol\ + \ (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\n\ A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\ \ The symptoms and the adrenal mass suggested pheochromocytoma, and the blood pressure\ \ indicates hypertension. Phenoxybenzamine is used to treat hypertension caused\ \ by pheochromocytoma. The answer is (D)." 
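Every file in this directory follows the same shape: the shared `_mmlu_flan_cot_fewshot_template_yaml` is pulled in through the `include` key, and only `dataset_name`, `description`, `group`, and `task` vary per subject. A minimal sketch of how such a file could be resolved, assuming (as an illustration, not the harness's actual loader, which lives in `lm_eval` and may differ) that the including file's keys override the template's:

```python
import os
import yaml

def load_task_config(path: str) -> dict:
    """Load a task YAML, recursively folding in its `include`d template.

    Assumption: keys in the including file win over the template's keys,
    and include paths are resolved relative to the including file.
    """
    with open(path) as f:
        cfg = yaml.safe_load(f)
    template = cfg.pop("include", None)
    if template is not None:
        base = load_task_config(os.path.join(os.path.dirname(path), template))
        base.update(cfg)  # subject-specific keys override template defaults
        cfg = base
    return cfg

cfg = load_task_config(
    "lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml"
)
print(cfg["task"], cfg["dataset_name"], cfg["group"])
```

Resolving the template per file keeps each subject YAML down to the handful of keys that actually differ.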
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_medicine +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml index e1e5206d..42a9a42e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml @@ -1,5 +1,5 @@ -dataset_name: professional_psychology -description: "The following are multiple choice questions (with answers) about professional\ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ \ psychology.\n\nQ: In the construction of a multiple regression equation for purposes\ \ of prediction, the optimal combination of measures is one in which the predictors\n\ (A) are uncorrelated with each other but are moderately correlated with the criterion\ @@ -18,30 +18,31 @@ description: "The following are multiple choice questions (with answers) about p \ step by step. We refer to Wikipedia articles on psychology for help. The definition\ \ of mode is the most frequently occurring number. The answer is (D).\n\nQ: Carl\ \ Jung believed that a client's transference:\n(A) is a fantasy that distracts the\ - \ client from reality. (B) represents \u201Cmixed feelings\u201D toward the therapist.\ - \ (C) \"is a form of \"\"acting out.\"\"\" (D) reflects the client\u2019s personal\ - \ and collective unconscious.\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on psychology for help. Transference is a phenomenon that a person's\ - \ feelings are unconsciously redirected, so it reflects the client's personal and\ - \ collective unconscious. The answer is (D).\n\nQ: In terms of Hofstede\u2019s (1980)\ - \ five cultural dimensions, the United States scores at the top of the scale on:\n\ - (A) individualism. (B) individualism and power distance. (C) power distance and\ - \ masculinity. (D) uncertainty avoidance.\nA: Let's think step by step. We refer\ - \ to Wikipedia articles on psychology for help. US scores highest on individualism\ - \ among the five cultural dimensions. The answer is (A).\n\nQ: One of your therapy\ - \ clients asks your advice about a good weight- reduction program. You have investigated\ - \ the programs in the community and are enrolled in the one you consider the best.\ - \ This program offers a $50 bonus to its patrons for each new person they bring\ - \ into the program. Under these circumstances, your most appropriate response would\ - \ be to\n(A) tell your client the pros and cons of each program you know about except\ - \ for the one in which you are enrolled (B) recommend to your client the program\ - \ in which you are enrolled and explain the $50 bonus you will receive (C) recommend\ - \ to your client the program in which you are enrolled and offer to have the $50\ - \ bonus credited to your client's account in the program (D) tell your client the\ - \ pros and cons of each program you know about, but do not claim the $50 bonus if\ - \ your client enrolls in your program\nA: Let's think step by step. We refer to\ - \ Wikipedia articles on psychology for help. 
Based on the circumstances, you should\ - \ tell your client about the pros and cons of each program, but it would be inappropriate\ - \ to receive the bonus, so you should not claim the $50 bonus. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_psychology + \ client from reality. (B) represents “mixed feelings” toward the therapist. (C)\ + \ \"is a form of \"\"acting out.\"\"\" (D) reflects the client’s personal and collective\ + \ unconscious.\nA: Let's think step by step. We refer to Wikipedia articles on psychology\ + \ for help. Transference is a phenomenon that a person's feelings are unconsciously\ + \ redirected, so it reflects the client's personal and collective unconscious. The\ + \ answer is (D).\n\nQ: In terms of Hofstede’s (1980) five cultural dimensions, the\ + \ United States scores at the top of the scale on:\n(A) individualism. (B) individualism\ + \ and power distance. (C) power distance and masculinity. (D) uncertainty avoidance.\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ US scores highest on individualism among the five cultural dimensions. The answer\ + \ is (A).\n\nQ: One of your therapy clients asks your advice about a good weight-\ + \ reduction program. You have investigated the programs in the community and are\ + \ enrolled in the one you consider the best. This program offers a $50 bonus to\ + \ its patrons for each new person they bring into the program. Under these circumstances,\ + \ your most appropriate response would be to\n(A) tell your client the pros and\ + \ cons of each program you know about except for the one in which you are enrolled\ + \ (B) recommend to your client the program in which you are enrolled and explain\ + \ the $50 bonus you will receive (C) recommend to your client the program in which\ + \ you are enrolled and offer to have the $50 bonus credited to your client's account\ + \ in the program (D) tell your client the pros and cons of each program you know\ + \ about, but do not claim the $50 bonus if your client enrolls in your program\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ Based on the circumstances, you should tell your client about the pros and cons\ + \ of each program, but it would be inappropriate to receive the bonus, so you should\ + \ not claim the $50 bonus. The answer is (D)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml index be4edf98..87e32dcc 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml @@ -1,65 +1,39 @@ -dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public - relations. - - - Q: Earth Hour was a campaign launched by which organization? - - (A) Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund - - A: Let''s think step by step. We refer to Wikipedia articles on public relations - for help. Earth Hour is a worldwide movement oragnized launched by the World Wildlife - Fund. The answer is (D). 
-
-
- Q: In issues management, what is the most proactive approach to addressing negative
- or misleading information posted online about your organization?
-
- (A) Buy domain names that could be used by opposition groups. (B) Post anonymous
- comments on blogs to combat this information. (C) Prepare a news release that discredits
- the inaccurate information. (D) Make policy changes to address complaints highlighted
- on these sites.
-
- A: Let''s think step by step. We refer to Wikipedia articles on public relations
- for help. In issues management, the most proactive approach to addressing negative
- or misleading information posted online is to make policy changes to address complaints
- highlighted on those sites. The answer is (D).
-
-
- Q: At which stage in the planning process would a situation analysis be carried
- out?
-
- (A) Defining the program (B) Planning the program (C) Taking action and implementing
- ideas (D) Evaluation of the program
-
- A: Let''s think step by step. We refer to Wikipedia articles on public relations
- for help. Situation analyses are typically carried out during the planning process
- stage of defining the program. The answer is (A).
-
-
- Q: Which of these statements is true of the Vatican in 2010 at the time of the accusations
- of child abuse cover-ups?
-
- (A) There was a coordinated media response. (B) Consistent messages were communicated.
- (C) Criticisms were taken as attacks on the Catholic Church. (D) The credibility
- of the Vatican was upheld.
-
- A: Let''s think step by step. We refer to Wikipedia articles on public relations
- for help. In 2010 when there were accusations of child abuse cover-ups, the Vatican
- took those criticisms as attacks on the Catholic Church. The answer is (C).
-
-
- Q: What should a public relations media practitioner do if she does not know the
- answer to a reporter''s question?
-
- (A) Give the reporter other information she is certain is correct. (B) Say that
- the information is ''off the record'' and will be disseminated later. (C) Say ''I
- don''t know'' and promise to provide the information later. (D) Say ''no comment,''
- rather than appear uninformed.
-
- A: Let''s think step by step. We refer to Wikipedia articles on public relations
- for help. If a public relations media practitioner does not know the answer to a
- reporter''s question, they should say ''I don''t know'' and offer to provide the
- information later. The answer is (C).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_public_relations
+"dataset_name": "public_relations"
+"description": "The following are multiple choice questions (with answers) about public\
+ \ relations.\n\nQ: Earth Hour was a campaign launched by which organization?\n(A)\
+ \ Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund\nA: Let's think step by\
+ \ step. We refer to Wikipedia articles on public relations for help. Earth Hour\
+ \ is a worldwide movement organized by the World Wildlife Fund. The answer\
+ \ is (D).\n\nQ: In issues management, what is the most proactive approach to addressing\
+ \ negative or misleading information posted online about your organization?\n(A)\
+ \ Buy domain names that could be used by opposition groups. (B) Post anonymous comments\
+ \ on blogs to combat this information. (C) Prepare a news release that discredits\
+ \ the inaccurate information. (D) Make policy changes to address complaints highlighted\
+ \ on these sites.\nA: Let's think step by step. 
We refer to Wikipedia articles on\ + \ public relations for help. In issues management, the most proactive approach to\ + \ addressing negative or misleading information posted online is to make policy\ + \ changes to address complaints highlighted on those sites. The answer is (D).\n\ + \nQ: At which stage in the planning process would a situation analysis be carried\ + \ out?\n(A) Defining the program (B) Planning the program (C) Taking action and\ + \ implementing ideas (D) Evaluation of the program\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on public relations for help. Situation analyses\ + \ are typically carried out during the planning process stage of defining the program.\ + \ The answer is (A).\n\nQ: Which of these statements is true of the Vatican in 2010\ + \ at the time of the accusations of child abuse cover-ups?\n(A) There was a coordinated\ + \ media response. (B) Consistent messages were communicated. (C) Criticisms were\ + \ taken as attacks on the Catholic Church. (D) The credibility of the Vatican was\ + \ upheld.\nA: Let's think step by step. We refer to Wikipedia articles on public\ + \ relations for help. In 2010 when there were accusations of child abuse cover-ups,\ + \ the Vatican took those criticisms as attacks on the Catholic Church. The answer\ + \ is (C).\n\nQ: What should a public relations media practitioner do if she does\ + \ not know the answer to a reporter's question?\n(A) Give the reporter other information\ + \ she is certain is correct. (B) Say that the information is 'off the record' and\ + \ will be disseminated later. (C) Say 'I don't know' and promise to provide the\ + \ information later. (D) Say 'no comment,' rather than appear uninformed.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on public relations for help.\ + \ If a public relations media practitioner does not know the answer to a reporter's\ + \ question, they should say 'I don't know' and offer to provide the information\ + \ later. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml index b08c321a..afc3199d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml @@ -1,5 +1,5 @@ -dataset_name: security_studies -description: "The following are multiple choice questions (with answers) about security\ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security\ \ studies.\n\nQ: What are the frameworks of analysis within which terrorism has\ \ been considered (as of 2020)?\n(A) Competition between larger nations has resulted\ \ in some countries actively supporting terrorist groups to undermine the strength\ @@ -81,5 +81,6 @@ description: "The following are multiple choice questions (with answers) about s \ for negotiation or concession.\nA: Let's think step by step. We refer to Wikipedia\ \ articles on security studies for help. Coercive diplomacy uses the threat of force\ \ to induce the opponent to comply with demands. The answer is (B)." 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_security_studies +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml index 38974b00..27de15a5 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml @@ -1,67 +1,43 @@ -dataset_name: sociology -description: 'The following are multiple choice questions (with answers) about sociology. - - - Q: Which of the following is not a problem associated with official statistics on - strike action? - - (A) most strikes go unnoticed by employers and the mass media (B) not all industrial - disputes will be reported by the employer (C) the definition of strikes excludes - those that involve fewer than ten workers or last less than one day (D) it is hard - to compare strikes that were measured in different ways - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - Official statistics on strike action can be problematic because not all industrial - disputes will be reported by employers, the definition of strikes excludes those - that involves fewer than ten workers or last less than one day, and it is hard to - compare strikes that were measured in different ways. Thus, (A) is not a problem - associated with official statistics on strike action. The answer is (A). - - - Q: What does Berger (1963) describe as a metaphor for social reality? - - (A) a fairground ride (B) a circus (C) a puppet theatre (D) a ballet - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - Berger describes social reality using the metaphor of a puppet theatre. The answer - is (C). - - - Q: The term ''hegemony'' refers to: - - (A) the tendency for the working class not to realize their own interests (B) a - dominant ideology that legitimates economic, political and cultural power (C) a - form of dual consciousness based on ideology and everyday experiences (D) a mode - of payment given for outstanding topiary - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - Hegemony refers to a dominant ideology that legitimates economic, policital, and - cultural power. The answer is (B). - - - Q: The shift from ''civil religion'' to ''common religion'' means that: - - (A) the increasing bureaucracy of the state has made religion only a marginal part - of our lives (B) despite the weakening of traditional authority, our everyday lives - and ''common sense'' remain shaped by religious beliefs and values (C) religious - participation in collective worship may have declined, but people still practise - their faiths in private (D) people are much more likely to discuss their religious - beliefs in public, informal settings - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - The shift from civil religion to common religion means that despite the weakening - of traditional authority, our everyday lives and common sense remain shaped by religious - beliefs and values. The answer is (B). - - - Q: Which of the following did the post-war welfare state of 1948 not aim to provide: - - (A) free health care and education for all (B) a minimum wage (C) full employment - (D) universal welfare - - A: Let''s think step by step. 
We refer to Wikipedia articles on sociology for help.
- The post-war welfare state of 1948 aimed to provide free healthcare and education,
- full employment, and universal welfare. But it did not aim to provide a minimum
- wage. The answer is (B).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_sociology
+"dataset_name": "sociology"
+"description": "The following are multiple choice questions (with answers) about sociology.\n\
+ \nQ: Which of the following is not a problem associated with official statistics\
+ \ on strike action?\n(A) most strikes go unnoticed by employers and the mass media\
+ \ (B) not all industrial disputes will be reported by the employer (C) the definition\
+ \ of strikes excludes those that involve fewer than ten workers or last less than\
+ \ one day (D) it is hard to compare strikes that were measured in different ways\n\
+ A: Let's think step by step. We refer to Wikipedia articles on sociology for help.\
+ \ Official statistics on strike action can be problematic because not all industrial\
+ \ disputes will be reported by employers, the definition of strikes excludes those\
+ \ that involve fewer than ten workers or last less than one day, and it is hard\
+ \ to compare strikes that were measured in different ways. Thus, (A) is not a problem\
+ \ associated with official statistics on strike action. The answer is (A).\n\nQ:\
+ \ What does Berger (1963) describe as a metaphor for social reality?\n(A) a fairground\
+ \ ride (B) a circus (C) a puppet theatre (D) a ballet\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on sociology for help. Berger describes social\
+ \ reality using the metaphor of a puppet theatre. The answer is (C).\n\nQ: The term\
+ \ 'hegemony' refers to:\n(A) the tendency for the working class not to realize their\
+ \ own interests (B) a dominant ideology that legitimates economic, political and\
+ \ cultural power (C) a form of dual consciousness based on ideology and everyday\
+ \ experiences (D) a mode of payment given for outstanding topiary\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on sociology for help. Hegemony refers\
+ \ to a dominant ideology that legitimates economic, political, and cultural power.\
+ \ The answer is (B).\n\nQ: The shift from 'civil religion' to 'common religion'\
+ \ means that:\n(A) the increasing bureaucracy of the state has made religion only\
+ \ a marginal part of our lives (B) despite the weakening of traditional authority,\
+ \ our everyday lives and 'common sense' remain shaped by religious beliefs and values\
+ \ (C) religious participation in collective worship may have declined, but people\
+ \ still practise their faiths in private (D) people are much more likely to discuss\
+ \ their religious beliefs in public, informal settings\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on sociology for help. The shift from civil religion\
+ \ to common religion means that despite the weakening of traditional authority,\
+ \ our everyday lives and common sense remain shaped by religious beliefs and values.\
+ \ The answer is (B).\n\nQ: Which of the following did the post-war welfare state\
+ \ of 1948 not aim to provide:\n(A) free health care and education for all (B) a\
+ \ minimum wage (C) full employment (D) universal welfare\nA: Let's think step by\
+ \ step. We refer to Wikipedia articles on sociology for help. 
The post-war welfare\ + \ state of 1948 aimed to provide free healthcare and education, full employment,\ + \ and universal welfare. But it did not aim to provide a minimum wage. The answer\ + \ is (B)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_sociology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml index 6340aee3..fb996730 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml @@ -1,66 +1,40 @@ -dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us - foreign policy. - - - Q: How did Donald Trump attack globalization in the 2016 campaign? - - (A) Globalization had made men like him too rich (B) Globalization only benefited - certain American states, such as New York (C) Liberal elites had encouraged globalization, - while ''ordinary Americans'' lost jobs because of it (D) Globalization encouraged - damaging trade wars - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. Trump attacked globalization because he believed ordinary Americans lost - jobs due to it, and so he wanted to blame liberals who had encouraged it. The answer - is (C). - - - Q: How did NSC-68 change U.S. strategy? - - (A) It globalized containment. (B) It militarized containment. (C) It called for - the development of the hydrogen bomb. (D) All of the above - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. NSC-68 outlined a variety of courses of action, including globalization - of containment, militarization of contaiment, and the development of the hydrogen - bomb. The answer is (D). - - - Q: How do Defensive Realism and Offensive Realism differ in their explanation of - state behaviour? - - (A) Defensive realists place greater emphasis on the role of international institutions - (B) Defensive realists place less emphasis on geographical factors (C) Offensive - realists give more priority to the national interest than Defensive realists. (D) - Defensive realists believe states are security maximizers, while Offensive realists - believe states to be power maximizers - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. While defensive realism advocates that states are security maximizers, - offensive realists think of states as power maximizers. The answer is (D). - - - Q: The realm of policy decisions concerned primarily with relations between the - United States and the rest of the world is known as - - (A) terrorism policy. (B) economic policy. (C) foreign policy. (D) international - policy. - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. The topic of policy decisions concerns with relations between the US and - the rest of the world is known as foreign policy. The answer is (C). - - - Q: How did the 2008 financial crisis affect America''s international reputation? - - (A) It damaged support for the US model of political economy and capitalism (B) - It created anger at the United States for exaggerating the crisis (C) It increased - support for American global leadership under President Obama (D) It reduced global - use of the US dollar - - A: Let''s think step by step. 
We refer to Wikipedia articles on us foreign policy
- for help. The 2008 financial crisis damanged the international reputation of the
- American model of political economy and capitalism. The answer is (A).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_us_foreign_policy
+"dataset_name": "us_foreign_policy"
+"description": "The following are multiple choice questions (with answers) about us\
+ \ foreign policy.\n\nQ: How did Donald Trump attack globalization in the 2016 campaign?\n\
+ (A) Globalization had made men like him too rich (B) Globalization only benefited\
+ \ certain American states, such as New York (C) Liberal elites had encouraged globalization,\
+ \ while 'ordinary Americans' lost jobs because of it (D) Globalization encouraged\
+ \ damaging trade wars\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on us foreign policy for help. Trump attacked globalization because he believed\
+ \ ordinary Americans lost jobs due to it, and so he wanted to blame liberals who\
+ \ had encouraged it. The answer is (C).\n\nQ: How did NSC-68 change U.S. strategy?\n\
+ (A) It globalized containment. (B) It militarized containment. (C) It called for\
+ \ the development of the hydrogen bomb. (D) All of the above\nA: Let's think step\
+ \ by step. We refer to Wikipedia articles on us foreign policy for help. NSC-68\
+ \ outlined a variety of courses of action, including globalization of containment,\
+ \ militarization of containment, and the development of the hydrogen bomb. The answer\
+ \ is (D).\n\nQ: How do Defensive Realism and Offensive Realism differ in their explanation\
+ \ of state behaviour?\n(A) Defensive realists place greater emphasis on the role\
+ \ of international institutions (B) Defensive realists place less emphasis on geographical\
+ \ factors (C) Offensive realists give more priority to the national interest than\
+ \ Defensive realists. (D) Defensive realists believe states are security maximizers,\
+ \ while Offensive realists believe states to be power maximizers\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on us foreign policy for help. While\
+ \ defensive realism advocates that states are security maximizers, offensive realists\
+ \ think of states as power maximizers. The answer is (D).\n\nQ: The realm of policy\
+ \ decisions concerned primarily with relations between the United States and the\
+ \ rest of the world is known as\n(A) terrorism policy. (B) economic policy. (C)\
+ \ foreign policy. (D) international policy.\nA: Let's think step by step. We refer\
+ \ to Wikipedia articles on us foreign policy for help. The topic of policy decisions\
+ \ concerned with relations between the US and the rest of the world is known as foreign\
+ \ policy. The answer is (C).\n\nQ: How did the 2008 financial crisis affect America's\
+ \ international reputation?\n(A) It damaged support for the US model of political\
+ \ economy and capitalism (B) It created anger at the United States for exaggerating\
+ \ the crisis (C) It increased support for American global leadership under President\
+ \ Obama (D) It reduced global use of the US dollar\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on us foreign policy for help. The 2008 financial\
+ \ crisis damaged the international reputation of the American model of political\
+ \ economy and capitalism. The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml index 5bbd7a2c..71f8f8bf 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml @@ -1,55 +1,31 @@ -dataset_name: virology -description: 'The following are multiple choice questions (with answers) about virology. - - - Q: The median survival time to AIDS and death was established by following: - - (A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters - (D) High-risk seronegatives - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - The median survival time to AIDS and death was established as a result of the development - of seroconverters. The answer is (C). - - - Q: Which of the following is a morphological characteristic of the paramyxoviruses. - - (A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate - viruses (C) Icosahedral viruses with envelope (D) Very large viruses - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - Paramyxoviruses are fragile viruses often visualised with RNA spewing from the inside. - The answer is (A). - - - Q: The most important goal of a behavioral intervention is: - - (A) Change in behavior (B) Comprehensive coverage (C) Effective use of behavioral - theory (D) Sustained behavior change - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - The prim goal of a behavioral intervention is to cause sustained behavior change. - The answer is (D). - - - Q: A key factor facilitating the application of nested case-control studies from - the MACS was: - - (A) Data collection (B) Establishment of a repository of biologic specimens (C) - Participant interest (D) Administration of the questionnaire by staff - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - The Multicenter AIDS Cohort Study''s use of nested case-control studies was facilitated - by the establishment of a repository of biologic specimens. The answer is (B). - - - Q: Why are parvoviruses a highly impactful parasite? - - (A) Because they have no nucleic acid (B) They require a helper virus (C) Only replicate - in dividing cells (D) Can integrate into host chromosomes - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - Paroviruses are highly impactful because they do not have nucleic acid. The answer - is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_virology +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \nQ: The median survival time to AIDS and death was established by following:\n\ + (A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters\ + \ (D) High-risk seronegatives\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on virology for help. The median survival time to AIDS and death was\ + \ established as a result of the development of seroconverters. 
The answer is (C).\n\
+ \nQ: Which of the following is a morphological characteristic of the paramyxoviruses.\n\
+ (A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate\
+ \ viruses (C) Icosahedral viruses with envelope (D) Very large viruses\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on virology for help. Paramyxoviruses\
+ \ are fragile viruses often visualised with RNA spewing from the inside. The answer\
+ \ is (A).\n\nQ: The most important goal of a behavioral intervention is:\n(A) Change\
+ \ in behavior (B) Comprehensive coverage (C) Effective use of behavioral theory\
+ \ (D) Sustained behavior change\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on virology for help. The primary goal of a behavioral intervention is to\
+ \ cause sustained behavior change. The answer is (D).\n\nQ: A key factor facilitating\
+ \ the application of nested case-control studies from the MACS was:\n(A) Data collection\
+ \ (B) Establishment of a repository of biologic specimens (C) Participant interest\
+ \ (D) Administration of the questionnaire by staff\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on virology for help. The Multicenter AIDS Cohort\
+ \ Study's use of nested case-control studies was facilitated by the establishment\
+ \ of a repository of biologic specimens. The answer is (B).\n\nQ: Why are parvoviruses\
+ \ a highly impactful parasite?\n(A) Because they have no nucleic acid (B) They require\
+ \ a helper virus (C) Only replicate in dividing cells (D) Can integrate into host\
+ \ chromosomes\nA: Let's think step by step. We refer to Wikipedia articles on virology\
+ \ for help. Parvoviruses are highly impactful because they do not have nucleic acid.\
+ \ The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_virology"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml
index c01adcdb..13390322 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml
@@ -1,53 +1,27 @@
-dataset_name: world_religions
-description: 'The following are multiple choice questions (with answers) about world
- religions.
-
-
- Q: How can the Upanishads be characterized?
-
- (A) Ritual texts (B) Philosophical texts (C) Hymns (D) Origin stories
-
- A: Let''s think step by step. We refer to Wikipedia articles on world religions
- for help. The Upanishads are the most recent part of Vedas (the oldest scriptures
- in Hinduism) and supplied the basis of later Hindu philosophy. So they are philosophical
- texts. The answer is (B).
-
-
- Q: What is the Second Gem in Buddhism?
-
- (A) The Dharma (B) The Sangha (C) The Buddha (D) The Bodhisattva
-
- A: Let''s think step by step. We refer to Wikipedia articles on world religions
- for help. The Second Gem in Buddhism is The Dharma. The answer is (A).
-
-
- Q: Which Japanese government promoted a kind of national cult based on the emperor
- and his associations with kami?
-
- (A) Honen (B) Tanaka (C) Tokugawa (D) Meiji
-
- A: Let''s think step by step. We refer to Wikipedia articles on world religions
- for help. The promotion of a national cult based on the emperor and his associations
- with Kami happened during the reign of Emperor Meiji (1852-1912). The answer is
- (D).
- - - Q: In which dynasty was the "Mandate of Heaven" developed to legitimatize the new - rulers? - - (A) Shang (B) Zhou (C) Han (D) Xia - - A: Let''s think step by step. We refer to Wikipedia articles on world religions - for help. The "Mandate of Heaven" was developed as an ancient Chinese philosophical - concept during the Zhou Dynasty (1046-256 BCE). The answer is (B). - - - Q: What is the sign of the covenant for Jewish males? - - (A) The rainbow (B) Circumcision (C) A son (D) Bar mitzvah - - A: Let''s think step by step. We refer to Wikipedia articles on world religions - for help. In Judaism, the most distinctive sign of the covenant is circumcision - (brit milah). The answer is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_world_religions +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\nQ: How can the Upanishads be characterized?\n(A) Ritual texts (B)\ + \ Philosophical texts (C) Hymns (D) Origin stories\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on world religions for help. The Upanishads are\ + \ the most recent part of Vedas (the oldest scriptures in Hinduism) and supplied\ + \ the basis of later Hindu philosophy. So they are philosophical texts. The answer\ + \ is (B).\n\nQ: What is the Second Gem in Buddhism?\n(A) The Dharma (B) The Sangha\ + \ (C) The Buddha (D) The Bodhisattva\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on world religions for help. The Second Gem in Buddhism is The Dharma.\ + \ The answer is (A).\n\nQ: Which Japanese government promoted a kind of national\ + \ cult based on the emperor and his associations with kami?\n(A) Honen (B) Tanaka\ + \ (C) Tokugawa (D) Meiji\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on world religions for help. The promotion of a national cult based on the emperor\ + \ and his associations with Kami happened during the reign of Emperor Meiji (1852-1912).\ + \ The answer is (D).\n\nQ: In which dynasty was the \"Mandate of Heaven\" developed\ + \ to legitimatize the new rulers?\n(A) Shang (B) Zhou (C) Han (D) Xia\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on world religions for help.\ + \ The \"Mandate of Heaven\" was developed as an ancient Chinese philosophical concept\ + \ during the Zhou Dynasty (1046-256 BCE). The answer is (B).\n\nQ: What is the sign\ + \ of the covenant for Jewish males?\n(A) The rainbow (B) Circumcision (C) A son\ + \ (D) Bar mitzvah\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ world religions for help. In Judaism, the most distinctive sign of the covenant\ + \ is circumcision (brit milah). The answer is (B)." 
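(Editor's note: every few-shot "description" above is built to end with "The answer is (X).", so these generative CoT variants are scored by pulling that final letter back out of each completion. The shared template referenced via "include" is not shown in this hunk, so the sketch below is an assumption about how that extraction can work, not the harness's actual filter; the name "extract_choice" is illustrative.)

    import re

    def extract_choice(completion):
        # Take the last "answer is (X)" occurrence, so reasoning that
        # mentions other letters along the way does not win.
        matches = re.findall(r"answer is \(?([ABCD])\)?", completion)
        return matches[-1] if matches else None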
+"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml new file mode 100644 index 00000000..390425c7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml @@ -0,0 +1,6 @@ +group: mmlu_flan_cot_zeroshot +task: + - mmlu_flan_cot_zeroshot_stem + - mmlu_flan_cot_zeroshot_other + - mmlu_flan_cot_zeroshot_social_sciences + - mmlu_flan_cot_zeroshot_humanities diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml similarity index 100% rename from lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml rename to lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml index 17bccf1f..8609f626 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml @@ -1,8 +1,6 @@ -dataset_name: abstract_algebra -description: 'The following are multiple choice questions (with answers) about abstract - algebra. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_abstract_algebra +"dataset_name": "abstract_algebra" +"description": "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml index 6e14fbc6..2923349d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml @@ -1,7 +1,6 @@ -dataset_name: anatomy -description: 'The following are multiple choice questions (with answers) about anatomy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_anatomy +"dataset_name": "anatomy" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml index b1ca9f52..e5ffd8ff 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml @@ -1,7 +1,6 @@ -dataset_name: astronomy -description: 'The following are multiple choice questions (with answers) about astronomy. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_astronomy +"dataset_name": "astronomy" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml index 53f3a78f..a6428571 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml @@ -1,8 +1,6 @@ -dataset_name: business_ethics -description: 'The following are multiple choice questions (with answers) about business - ethics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_business_ethics +"dataset_name": "business_ethics" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml index f858d671..e3655230 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml @@ -1,8 +1,6 @@ -dataset_name: clinical_knowledge -description: 'The following are multiple choice questions (with answers) about clinical - knowledge. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_clinical_knowledge +"dataset_name": "clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml index 93471b6a..736bb6de 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml @@ -1,8 +1,6 @@ -dataset_name: college_biology -description: 'The following are multiple choice questions (with answers) about college - biology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_biology +"dataset_name": "college_biology" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml index 5f619baa..7b719966 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml @@ -1,8 +1,6 @@ -dataset_name: college_chemistry -description: 'The following are multiple choice questions (with answers) about college - chemistry. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_chemistry +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml index 865b91bf..185f2a66 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml @@ -1,8 +1,6 @@ -dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college - computer science. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_computer_science +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml index 1f8a89fa..210eb127 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml @@ -1,8 +1,6 @@ -dataset_name: college_mathematics -description: 'The following are multiple choice questions (with answers) about college - mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_mathematics +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml index e852c64b..51c8a3c0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml @@ -1,8 +1,6 @@ -dataset_name: college_medicine -description: 'The following are multiple choice questions (with answers) about college - medicine. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_medicine +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml index f215c2f0..319c7214 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml @@ -1,8 +1,6 @@ -dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college - physics. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_physics +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml index 402f7bdc..ae4bda96 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml @@ -1,8 +1,6 @@ -dataset_name: computer_security -description: 'The following are multiple choice questions (with answers) about computer - security. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_computer_security +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml index c3ad6376..2e1e43db 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml @@ -1,8 +1,6 @@ -dataset_name: conceptual_physics -description: 'The following are multiple choice questions (with answers) about conceptual - physics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_conceptual_physics +"dataset_name": "conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml index dad5a83b..9ff25bba 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml @@ -1,7 +1,6 @@ -dataset_name: econometrics -description: 'The following are multiple choice questions (with answers) about econometrics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_econometrics +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml index 72a08dca..ca10a43e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml @@ -1,8 +1,6 @@ -dataset_name: electrical_engineering -description: 'The following are multiple choice questions (with answers) about electrical - engineering. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_electrical_engineering +"dataset_name": "electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml index 0531f23e..065c92d2 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml @@ -1,8 +1,6 @@ -dataset_name: elementary_mathematics -description: 'The following are multiple choice questions (with answers) about elementary - mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_elementary_mathematics +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml index 80b26401..ec2d323c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml @@ -1,8 +1,6 @@ -dataset_name: formal_logic -description: 'The following are multiple choice questions (with answers) about formal - logic. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_formal_logic +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml index 491d0db4..b1e29a3e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml @@ -1,8 +1,6 @@ -dataset_name: global_facts -description: 'The following are multiple choice questions (with answers) about global - facts. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_global_facts +"dataset_name": "global_facts" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml index 32da2e26..0e5794db 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_biology -description: 'The following are multiple choice questions (with answers) about high - school biology. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_biology +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml index 5968e54e..eba398b0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_chemistry -description: 'The following are multiple choice questions (with answers) about high - school chemistry. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_chemistry +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml index 2666de90..4a69dbb3 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_computer_science -description: 'The following are multiple choice questions (with answers) about high - school computer science. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_computer_science +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml index fb59ada4..54eafb51 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_european_history -description: 'The following are multiple choice questions (with answers) about high - school european history. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_european_history +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml index ed3fca55..0898c876 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_geography -description: 'The following are multiple choice questions (with answers) about high - school geography. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_geography +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml index 62803b4b..d82fb6b0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_government_and_politics -description: 'The following are multiple choice questions (with answers) about high - school government and politics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_government_and_politics +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml index f973b58d..b94fc2a6 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high - school macroeconomics. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_macroeconomics +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml index 550dfcf1..dff0960a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_mathematics -description: 'The following are multiple choice questions (with answers) about high - school mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_mathematics +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml index 8a1e4c4c..75a08c48 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_microeconomics -description: 'The following are multiple choice questions (with answers) about high - school microeconomics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_microeconomics +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml index 4997e712..177d42da 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_physics -description: 'The following are multiple choice questions (with answers) about high - school physics. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_physics +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml index a3e801ca..d5d47723 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high - school psychology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_psychology +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml index d057cbef..b245cf9e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high - school statistics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_statistics +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml index 583d9591..2e187da2 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_us_history -description: 'The following are multiple choice questions (with answers) about high - school us history. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_us_history +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml index 40445582..c89dd0fa 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_world_history -description: 'The following are multiple choice questions (with answers) about high - school world history. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_world_history +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml index c6db4c1c..230781b4 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml @@ -1,8 +1,6 @@ -dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human - aging. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_human_aging +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml index 41795660..ed2116dd 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml @@ -1,8 +1,6 @@ -dataset_name: human_sexuality -description: 'The following are multiple choice questions (with answers) about human - sexuality. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_human_sexuality +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml index da1273b0..d777e9fc 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml @@ -1,8 +1,6 @@ -dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international - law. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_international_law +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_international_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml index e1a6a28b..62b86dd0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml @@ -1,7 +1,6 @@ -dataset_name: jurisprudence -description: 'The following are multiple choice questions (with answers) about jurisprudence. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_jurisprudence +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml index e94cde17..07ae8438 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml @@ -1,8 +1,6 @@ -dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical - fallacies. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_logical_fallacies +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml index a17387bd..cd4813ef 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml @@ -1,8 +1,6 @@ -dataset_name: machine_learning -description: 'The following are multiple choice questions (with answers) about machine - learning. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_machine_learning +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml index 68fc6ba2..b7164c1c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml @@ -1,7 +1,6 @@ -dataset_name: management -description: 'The following are multiple choice questions (with answers) about management. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_management +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_management" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml index f6c6444c..0827f78d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml @@ -1,7 +1,6 @@ -dataset_name: marketing -description: 'The following are multiple choice questions (with answers) about marketing. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_marketing +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_marketing" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml index 2490826b..1706ee5b 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml @@ -1,8 +1,6 @@ -dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical - genetics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_medical_genetics +"dataset_name": "medical_genetics" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml index 5aebaef8..295d801a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml @@ -1,7 +1,6 @@ -dataset_name: miscellaneous -description: 'The following are multiple choice questions (with answers) about miscellaneous. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_miscellaneous +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml index 85829454..a4595f06 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml @@ -1,8 +1,6 @@ -dataset_name: moral_disputes -description: 'The following are multiple choice questions (with answers) about moral - disputes. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_moral_disputes +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml index f8a31ddc..a0e41ae4 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml @@ -1,8 +1,6 @@ -dataset_name: moral_scenarios -description: 'The following are multiple choice questions (with answers) about moral - scenarios. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_moral_scenarios +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml index 238c3f1c..4c87be43 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml @@ -1,7 +1,6 @@ -dataset_name: nutrition -description: 'The following are multiple choice questions (with answers) about nutrition. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_nutrition +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml index c4a8fb47..534707cb 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml @@ -1,7 +1,6 @@ -dataset_name: philosophy -description: 'The following are multiple choice questions (with answers) about philosophy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_philosophy +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml index 07f31813..3233ba4e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml @@ -1,7 +1,6 @@ -dataset_name: prehistory -description: 'The following are multiple choice questions (with answers) about prehistory. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_prehistory +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml index 82b5ff2c..021090c6 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_accounting -description: 'The following are multiple choice questions (with answers) about professional - accounting. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_accounting +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml index 32210b49..73d115d7 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_law -description: 'The following are multiple choice questions (with answers) about professional - law. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_law +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional\ + \ law.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml index ed9eebe1..47cf9573 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_medicine -description: 'The following are multiple choice questions (with answers) about professional - medicine. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_medicine +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml index 7110b840..cc055d5b 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_psychology -description: 'The following are multiple choice questions (with answers) about professional - psychology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_psychology +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml index 5138cdd8..14d02c3a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml @@ -1,8 +1,6 @@ -dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public - relations. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_public_relations +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml index 84c359d7..cae551e2 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml @@ -1,8 +1,6 @@ -dataset_name: security_studies -description: 'The following are multiple choice questions (with answers) about security - studies. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_security_studies +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml index fed1dc49..45b94193 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml @@ -1,7 +1,6 @@ -dataset_name: sociology -description: 'The following are multiple choice questions (with answers) about sociology. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_sociology +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_sociology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml index d94f60e9..52e48277 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml @@ -1,8 +1,6 @@ -dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us - foreign policy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_us_foreign_policy +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml index feaa8b06..fda1af06 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml @@ -1,7 +1,6 @@ -dataset_name: virology -description: 'The following are multiple choice questions (with answers) about virology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_virology +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_virology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml index fe2b4c42..40518282 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml @@ -1,8 +1,6 @@ -dataset_name: world_religions -description: 'The following are multiple choice questions (with answers) about world - religions. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_world_religions +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml new file mode 100644 index 00000000..7705a171 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml @@ -0,0 +1,6 @@ +group: mmlu_flan_n_shot_generative +task: + - mmlu_flan_n_shot_generative_stem + - mmlu_flan_n_shot_generative_other + - mmlu_flan_n_shot_generative_social_sciences + - mmlu_flan_n_shot_generative_humanities diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml similarity index 100% rename from lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml index 49b9c425..40cced2c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml @@ -1,4 +1,6 @@ "dataset_name": "abstract_algebra" -"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" +"description": "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml similarity index 70% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml index 0c8d7914..606049a5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml @@ -1,4 +1,6 @@ "dataset_name": "anatomy" -"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml similarity index 70% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml index c92a1027..db5faa22 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml @@ -1,4 +1,6 @@ "dataset_name": "astronomy" -"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" +"description": "The 
following are multiple choice questions (with answers) about astronomy.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml index 4b65902e..add2ffb4 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml @@ -1,4 +1,6 @@ "dataset_name": "business_ethics" -"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml index 295fb234..e3f24569 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml @@ -1,4 +1,6 @@ "dataset_name": "clinical_knowledge" -"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml index f945181b..3772b0e6 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_biology" -"description": "The following are multiple choice questions (with answers) about college biology.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml index 1fdab27d..cedcf0cc 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_chemistry" -"description": "The following are multiple 
choice questions (with answers) about college chemistry.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml index 6b41a5bb..a060903a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_computer_science" -"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml index 29e80a5e..1899ce65 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_mathematics" -"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml index a5061541..ab052dd6 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_medicine" -"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml index ec3262ee..3b1e64e6 100644 --- 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_physics" -"description": "The following are multiple choice questions (with answers) about college physics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml index a9ade9c5..cd312a93 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml @@ -1,4 +1,6 @@ "dataset_name": "computer_security" -"description": "The following are multiple choice questions (with answers) about computer security.\n\n" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml index 5a903a65..49e6b38c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml @@ -1,4 +1,6 @@ "dataset_name": "conceptual_physics" -"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml similarity index 67% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml index 847c8ce6..d9b4ebfc 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml @@ -1,4 +1,6 @@ "dataset_name": "econometrics" -"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml rename to 
lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml index 038379e0..e6efb0ef 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml @@ -1,4 +1,6 @@ "dataset_name": "electrical_engineering" -"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml index 4fd779de..b33cf318 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml @@ -1,4 +1,6 @@ "dataset_name": "elementary_mathematics" -"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml index bb528831..1c2ad3a1 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml @@ -1,4 +1,6 @@ "dataset_name": "formal_logic" -"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml index 1145dcab..a2352ff7 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml @@ -1,4 +1,6 @@ "dataset_name": "global_facts" -"description": "The following are multiple choice questions (with answers) about global facts.\n\n" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml 
b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml index 574a0c58..0b51f34a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_biology" -"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml index ef79ed73..0066ba77 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_chemistry" -"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml index 9d9200a6..b0d4ef15 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_computer_science" -"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml similarity index 67% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml index e4b52a9c..4b17db63 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_european_history" -"description": "The following are multiple choice questions (with 
answers) about high school european history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml index 8403d20e..93f8de20 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_geography" -"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml index 50ad3863..7ae12c17 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_government_and_politics" -"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml index 18bfb8b1..71d82259 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_macroeconomics" -"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml 
b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml index 1b04a06f..20d31e12 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_mathematics" -"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml index 9588af59..5c6d6ef9 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_microeconomics" -"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml index 4aa033c8..5b016778 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_physics" -"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml index 168c0c15..1abf244c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_psychology" -"description": "The following are multiple choice questions (with answers) about high school 
psychology.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml index ba195da9..df3e8d93 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_statistics" -"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml index 0605fbc4..68e3f0a9 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_us_history" -"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml index aa54d758..dfb839c3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_world_history" -"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml 
index d47b7fef..a857698f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml @@ -1,4 +1,6 @@ "dataset_name": "human_aging" -"description": "The following are multiple choice questions (with answers) about human aging.\n\n" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml similarity index 67% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml index 9be15e54..3dde3af5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml @@ -1,4 +1,6 @@ "dataset_name": "human_sexuality" -"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml index b80c9d58..be9018cf 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml @@ -1,4 +1,6 @@ "dataset_name": "international_law" -"description": "The following are multiple choice questions (with answers) about international law.\n\n" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml index 5e7a5395..f6b3c7ae 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml @@ -1,4 +1,6 @@ "dataset_name": "jurisprudence" -"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml rename to 
lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml index fcb718a0..c6363390 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml @@ -1,4 +1,6 @@ "dataset_name": "logical_fallacies" -"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml index d879b54c..64496cfb 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml @@ -1,4 +1,6 @@ "dataset_name": "machine_learning" -"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml index 887c71a3..63292cc1 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml @@ -1,4 +1,6 @@ "dataset_name": "management" -"description": "The following are multiple choice questions (with answers) about management.\n\n" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml index bad500ca..0716dc14 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml @@ -1,4 +1,6 @@ "dataset_name": "marketing" -"description": "The following are multiple choice questions (with answers) about marketing.\n\n" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_marketing" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml rename to 
lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml index c4faff12..92115979 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml @@ -1,4 +1,6 @@ "dataset_name": "medical_genetics" -"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml index e9aac340..74e88944 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml @@ -1,4 +1,6 @@ "dataset_name": "miscellaneous" -"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml index 41af33e0..58bf43df 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml @@ -1,4 +1,6 @@ "dataset_name": "moral_disputes" -"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml index 1689c3d9..9630e517 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml @@ -1,4 +1,6 @@ "dataset_name": "moral_scenarios" -"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml similarity index 69% rename from 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml index 24be1a63..df14da9d 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml @@ -1,4 +1,6 @@ "dataset_name": "nutrition" -"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml index 01040729..20f5d60b 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml @@ -1,4 +1,6 @@ "dataset_name": "philosophy" -"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml index fc5a6fbe..3695e770 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml @@ -1,4 +1,6 @@ "dataset_name": "prehistory" -"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml index cbdd2f0d..222642ac 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml @@ -1,4 +1,6 @@ "dataset_name": "professional_accounting" -"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml similarity index 68% rename from 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml index 42e46529..b4d39e49 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml @@ -1,4 +1,6 @@ "dataset_name": "professional_law" -"description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ law.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml index a64610e6..c420d0d3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml @@ -1,4 +1,6 @@ "dataset_name": "professional_medicine" -"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml index b0c574fe..c5ba7495 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml @@ -1,4 +1,6 @@ "dataset_name": "professional_psychology" -"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml index ff1030fc..9aa7d686 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml @@ -1,4 +1,6 @@ "dataset_name": "public_relations" -"description": "The following are multiple choice questions (with answers) about public relations.\n\n" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" 
"task": "mmlu_flan_n_shot_generative_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml index 25555da4..6d2e0cda 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml @@ -1,4 +1,6 @@ "dataset_name": "security_studies" -"description": "The following are multiple choice questions (with answers) about security studies.\n\n" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml similarity index 67% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml index f8ac254c..3c42d0b9 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml @@ -1,4 +1,6 @@ "dataset_name": "sociology" -"description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml similarity index 66% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml index af3917ac..5c514725 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml @@ -1,4 +1,6 @@ "dataset_name": "us_foreign_policy" -"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml similarity index 69% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml index b8df2d59..fb083b62 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml @@ -1,4 +1,6 @@ "dataset_name": "virology" -"description": "The following are multiple choice questions (with answers) about virology.\n\n" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_generative_template_yaml" "task": 
"mmlu_flan_n_shot_generative_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml similarity index 68% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml index 496f66c5..0f2c199a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml @@ -1,4 +1,6 @@ "dataset_name": "world_religions" -"description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_generative_template_yaml" "task": "mmlu_flan_n_shot_generative_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml new file mode 100644 index 00000000..7705a171 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml @@ -0,0 +1,6 @@ +group: mmlu_flan_n_shot_generative +task: + - mmlu_flan_n_shot_generative_stem + - mmlu_flan_n_shot_generative_other + - mmlu_flan_n_shot_generative_social_sciences + - mmlu_flan_n_shot_generative_humanities diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml similarity index 94% rename from lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml index 5db2981a..2d5d92ef 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml @@ -12,4 +12,4 @@ metric_list: higher_is_better: true - metric: acc_norm aggregation: mean - higher_is_better: true + higher_is_better: true \ No newline at end of file diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml index 4ea918d6..7ac6123b 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml @@ -1,4 +1,6 @@ "dataset_name": "abstract_algebra" -"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" +"description": "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_abstract_algebra" +"task": "mmlu_flan_n_shot_generative_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml similarity index 53% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml index 9205bd31..2790a593 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml +++ 
b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml @@ -1,4 +1,6 @@ "dataset_name": "anatomy" -"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_anatomy" +"task": "mmlu_flan_n_shot_generative_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml similarity index 52% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml index dcd41de7..199e9560 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml @@ -1,4 +1,6 @@ "dataset_name": "astronomy" -"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_astronomy" +"task": "mmlu_flan_n_shot_generative_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml index 2b57abf3..4a346cd5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml @@ -1,4 +1,6 @@ "dataset_name": "business_ethics" -"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_business_ethics" +"task": "mmlu_flan_n_shot_generative_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml index 5b5da42e..8e27f055 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml @@ -1,4 +1,6 @@ "dataset_name": "clinical_knowledge" -"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_clinical_knowledge" +"task": "mmlu_flan_n_shot_generative_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml similarity index 51% rename 
from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml index c8cc429d..91a91c67 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_biology" -"description": "The following are multiple choice questions (with answers) about college biology.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_biology" +"task": "mmlu_flan_n_shot_generative_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml index 8be3a04d..8d3ddf27 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_chemistry" -"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_chemistry" +"task": "mmlu_flan_n_shot_generative_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml new file mode 100644 index 00000000..1a37e75a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml index a9fe1814..6ef3d578 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_mathematics" -"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_mathematics" +"task": "mmlu_flan_n_shot_generative_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml 
b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml index 6f5d767a..2bd3c63e 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_medicine" -"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_medicine" +"task": "mmlu_flan_n_shot_generative_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml index c6c22a40..174a4eee 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml @@ -1,4 +1,6 @@ "dataset_name": "college_physics" -"description": "The following are multiple choice questions (with answers) about college physics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_physics" +"task": "mmlu_flan_n_shot_generative_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml index 96bccc15..b5eed81a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml @@ -1,4 +1,6 @@ "dataset_name": "computer_security" -"description": "The following are multiple choice questions (with answers) about computer security.\n\n" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_computer_security" +"task": "mmlu_flan_n_shot_generative_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml index 2fc15ed0..c165c498 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml @@ -1,4 +1,6 @@ "dataset_name": "conceptual_physics" -"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" 
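Each of these per-subject files can stay six lines long because the shared configuration (the metric list and the rest of the common keys) lives in the _mmlu_flan_loglikelihood_template_yaml renamed alongside them; the include key splices the template in, with the subject file's own keys taking precedence. A rough sketch of those semantics follows; load_task_config is a hypothetical stand-in for illustration, not the harness's actual loader:

    import os
    import yaml

    def load_task_config(path):
        """Resolve a task YAML by splicing in its `include` template.

        Keys set in the including file override keys inherited from
        the template.  (Illustrative stand-in, not the real loader.)
        """
        with open(path) as f:
            config = yaml.safe_load(f)
        include = config.pop("include", None)
        if include is None:
            return config
        # Templates are looked up next to the file that includes them.
        base = load_task_config(os.path.join(os.path.dirname(path), include))
        base.update(config)
        return base

    # e.g. load_task_config(
    #     "lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml")
    # -> template keys plus the dataset_name/description/group/task overrides
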
+"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_conceptual_physics" +"task": "mmlu_flan_n_shot_generative_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml index 07dbf921..94ca68fe 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml @@ -1,4 +1,6 @@ "dataset_name": "econometrics" -"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_econometrics" +"task": "mmlu_flan_n_shot_generative_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml index 94492b11..7f72ffca 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml @@ -1,4 +1,6 @@ "dataset_name": "electrical_engineering" -"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_electrical_engineering" +"task": "mmlu_flan_n_shot_generative_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml index 2cc56ef8..091c7a90 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml @@ -1,4 +1,6 @@ "dataset_name": "elementary_mathematics" -"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_elementary_mathematics" +"task": "mmlu_flan_n_shot_generative_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml similarity index 50% rename from 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml index 17e28205..64a3d11d 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml @@ -1,4 +1,6 @@ "dataset_name": "formal_logic" -"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_formal_logic" +"task": "mmlu_flan_n_shot_generative_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml index 2b3cb863..1ec7cc2c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml @@ -1,4 +1,6 @@ "dataset_name": "global_facts" -"description": "The following are multiple choice questions (with answers) about global facts.\n\n" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_global_facts" +"task": "mmlu_flan_n_shot_generative_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml index ed3e70b2..2b2e15a0 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_biology" -"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_biology" +"task": "mmlu_flan_n_shot_generative_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml index 729d37fa..549aea5f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_chemistry" -"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" +"group": 
"mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_chemistry" +"task": "mmlu_flan_n_shot_generative_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml new file mode 100644 index 00000000..bdbcfe93 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml new file mode 100644 index 00000000..855db984 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml new file mode 100644 index 00000000..6744db9f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 00000000..c51d372f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 00000000..d0bf0220 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" 
+"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml index 83244ebb..958ab60b 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_mathematics" -"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_mathematics" +"task": "mmlu_flan_n_shot_generative_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml new file mode 100644 index 00000000..8eaf6059 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml index 25c32369..208bf5b9 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_physics" -"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_physics" +"task": "mmlu_flan_n_shot_generative_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml new file mode 100644 index 00000000..c11af0a6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_psychology" diff --git 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml index fa9075f5..a5babfe5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml @@ -1,4 +1,6 @@ "dataset_name": "high_school_statistics" -"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_statistics" +"task": "mmlu_flan_n_shot_generative_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml new file mode 100644 index 00000000..10306c2e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml new file mode 100644 index 00000000..db7c1c11 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml index d70d5e85..a3935d43 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml @@ -1,4 +1,6 @@ "dataset_name": "human_aging" -"description": "The following are multiple choice questions (with answers) about human aging.\n\n" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_human_aging" +"task": "mmlu_flan_n_shot_generative_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml new file mode 100644 index 00000000..4672103c --- /dev/null +++ 
b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml @@ -0,0 +1,6 @@ +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml index 03fab6ef..be63a3c5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml @@ -1,4 +1,6 @@ "dataset_name": "international_law" -"description": "The following are multiple choice questions (with answers) about international law.\n\n" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_international_law" +"task": "mmlu_flan_n_shot_generative_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml index bb6bfc6f..8e0a8191 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml @@ -1,4 +1,6 @@ "dataset_name": "jurisprudence" -"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_jurisprudence" +"task": "mmlu_flan_n_shot_generative_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml index d57576cd..8c920895 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml @@ -1,4 +1,6 @@ "dataset_name": "logical_fallacies" -"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_logical_fallacies" +"task": "mmlu_flan_n_shot_generative_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml similarity index 51% rename from 
lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml index 2c586922..f9aad4df 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml @@ -1,4 +1,6 @@ "dataset_name": "machine_learning" -"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_machine_learning" +"task": "mmlu_flan_n_shot_generative_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml similarity index 52% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml index 66b14f7f..4709c403 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml @@ -1,4 +1,6 @@ "dataset_name": "management" -"description": "The following are multiple choice questions (with answers) about management.\n\n" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_management" +"task": "mmlu_flan_n_shot_generative_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml similarity index 52% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml index aacee467..808f1c78 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml @@ -1,4 +1,6 @@ "dataset_name": "marketing" -"description": "The following are multiple choice questions (with answers) about marketing.\n\n" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_marketing" +"task": "mmlu_flan_n_shot_generative_marketing" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml index 72d607fb..3c0a99f8 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml @@ -1,4 +1,6 @@ "dataset_name": "medical_genetics" -"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_medical_genetics" 
+"task": "mmlu_flan_n_shot_generative_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml index 14db1ba8..c363f1bd 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml @@ -1,4 +1,6 @@ "dataset_name": "miscellaneous" -"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_miscellaneous" +"task": "mmlu_flan_n_shot_generative_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml index 0beccf44..d710816f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml @@ -1,4 +1,6 @@ "dataset_name": "moral_disputes" -"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_moral_disputes" +"task": "mmlu_flan_n_shot_generative_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml index 4d884b63..7d26770c 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml @@ -1,4 +1,6 @@ "dataset_name": "moral_scenarios" -"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_moral_scenarios" +"task": "mmlu_flan_n_shot_generative_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml similarity index 52% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml index ba1fdf61..677185b3 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml @@ -1,4 +1,6 @@ "dataset_name": "nutrition" -"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" 
+"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_nutrition" +"task": "mmlu_flan_n_shot_generative_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml index 21645e77..8c4b6f22 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml @@ -1,4 +1,6 @@ "dataset_name": "philosophy" -"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_philosophy" +"task": "mmlu_flan_n_shot_generative_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml similarity index 51% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml index 74d9f30c..64065a6f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml @@ -1,4 +1,6 @@ "dataset_name": "prehistory" -"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_prehistory" +"task": "mmlu_flan_n_shot_generative_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml new file mode 100644 index 00000000..4fb590f8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml index 15fdad65..581b9da7 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml @@ -1,4 +1,6 @@ "dataset_name": "professional_law" -"description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ 
+ \ law.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_law" +"task": "mmlu_flan_n_shot_generative_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml index 1bcc6a9a..c49f9119 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml @@ -1,4 +1,6 @@ "dataset_name": "professional_medicine" -"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_medicine" +"task": "mmlu_flan_n_shot_generative_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml new file mode 100644 index 00000000..2d6f441d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml new file mode 100644 index 00000000..3d330fc9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml @@ -0,0 +1,6 @@ +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml new file mode 100644 index 00000000..8bbe963f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml @@ -0,0 +1,6 @@ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml index c583cf24..0cc86bcc 100644 --- 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml @@ -1,4 +1,6 @@ "dataset_name": "sociology" -"description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_social_sciences" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_sociology" +"task": "mmlu_flan_n_shot_generative_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml new file mode 100644 index 00000000..12ac4f36 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_generative_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml similarity index 52% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml index c2cafd9b..6e942396 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml @@ -1,4 +1,6 @@ "dataset_name": "virology" -"description": "The following are multiple choice questions (with answers) about virology.\n\n" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_virology" +"task": "mmlu_flan_n_shot_generative_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml similarity index 50% rename from lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml index b1d1de0f..30f97421 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml @@ -1,4 +1,6 @@ "dataset_name": "world_religions" -"description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_world_religions" +"task": "mmlu_flan_n_shot_generative_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml deleted file mode 100644 index 506ee760..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_computer_science" -"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" 
-"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml deleted file mode 100644 index 7003e94c..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_computer_science" -"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml deleted file mode 100644 index 0ad96085..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_european_history" -"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml deleted file mode 100644 index f26e8bc6..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_geography" -"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml deleted file mode 100644 index 523e278d..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_government_and_politics" -"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml deleted file mode 100644 index 6b08a4fc..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_macroeconomics" -"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml deleted file mode 100644 index 982f3f08..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_microeconomics" -"description": "The 
following are multiple choice questions (with answers) about high school microeconomics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml deleted file mode 100644 index a6e431db..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_psychology" -"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml deleted file mode 100644 index 094f95d0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_us_history" -"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml deleted file mode 100644 index 6ffd6d08..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_world_history" -"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml deleted file mode 100644 index 39751188..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "human_sexuality" -"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml deleted file mode 100644 index 9010995f..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_accounting" -"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml deleted file mode 100644 index 9144805c..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_psychology" -"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" -"include": 
"_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml deleted file mode 100644 index 0b4adc04..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "public_relations" -"description": "The following are multiple choice questions (with answers) about public relations.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml deleted file mode 100644 index 2f4178f0..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "security_studies" -"description": "The following are multiple choice questions (with answers) about security studies.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml deleted file mode 100644 index f41d3c27..00000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "us_foreign_policy" -"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_us_foreign_policy" -- GitLab From c64bf9a98c7d2526674ff23bb59823217e2ead38 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 17 Oct 2023 14:42:37 +0000 Subject: [PATCH 114/212] change all mentions of `greedy_until` to `generate_until` --- lm_eval/api/task.py | 18 ++--- lm_eval/benchmarks/__init__.py | 76 ------------------- lm_eval/models/anthropic_llms.py | 6 +- lm_eval/models/dummy.py | 2 +- lm_eval/models/huggingface.py | 4 +- lm_eval/models/openai_completions.py | 6 +- lm_eval/models/textsynth.py | 16 ++-- lm_eval/tasks/__init__.py | 2 +- lm_eval/tasks/babi/babi.yaml | 2 +- .../_flan_cot_fewshot_template_yaml | 2 +- .../_flan_cot_zeroshot_template_yaml | 2 +- .../flan_fewshot/_flan_fewshot_template_yaml | 2 +- .../_flan_zeroshot_template_yaml | 2 +- .../flan/yaml_templates/cot_template_yaml | 2 +- .../flan/yaml_templates/held_in_template_yaml | 2 +- .../{ => tasks}/benchmarks/minerva_math.yaml | 0 lm_eval/tasks/benchmarks/t0_eval.yaml | 20 ++--- lm_eval/tasks/bigbench/generate_tasks.py | 4 +- .../abstract_narrative_understanding.yaml | 4 + .../bigbench/generate_until/anachronisms.yaml | 4 + .../generate_until/analogical_similarity.yaml | 4 + .../generate_until/analytic_entailment.yaml | 4 + .../bigbench/generate_until/arithmetic.yaml | 4 + .../ascii_word_recognition.yaml | 4 + .../authorship_verification.yaml | 4 + .../generate_until/auto_categorization.yaml | 4 + .../generate_until/auto_debugging.yaml | 4 + .../generate_until/bbq_lite_json.yaml | 4 + .../bridging_anaphora_resolution_barqa.yaml | 4 + .../generate_until/causal_judgment.yaml | 4 + .../generate_until/cause_and_effect.yaml | 4 + .../generate_until/checkmate_in_one.yaml | 4 + .../generate_until/chess_state_tracking.yaml | 4 + .../chinese_remainder_theorem.yaml | 4 + .../cifar10_classification.yaml | 4 + 
.../generate_until/code_line_description.yaml | 4 + .../bigbench/generate_until/codenames.yaml | 4 + .../tasks/bigbench/generate_until/color.yaml | 4 + .../generate_until/common_morpheme.yaml | 4 + .../conceptual_combinations.yaml | 4 + .../generate_until/conlang_translation.yaml | 4 + ...extual_parametric_knowledge_conflicts.yaml | 4 + .../generate_until/crash_blossom.yaml | 4 + .../bigbench/generate_until/crass_ai.yaml | 4 + .../generate_until/cryobiology_spanish.yaml | 4 + .../bigbench/generate_until/cryptonite.yaml | 4 + .../generate_until/cs_algorithms.yaml | 4 + .../generate_until/dark_humor_detection.yaml | 4 + .../generate_until/date_understanding.yaml | 4 + .../generate_until/disambiguation_qa.yaml | 4 + .../discourse_marker_prediction.yaml | 4 + .../bigbench/generate_until/disfl_qa.yaml | 4 + .../generate_until/dyck_languages.yaml | 4 + .../generate_until/elementary_math_qa.yaml | 4 + .../bigbench/generate_until/emoji_movie.yaml | 4 + .../emojis_emotion_prediction.yaml | 4 + .../generate_until/empirical_judgments.yaml | 4 + .../generate_until/english_proverbs.yaml | 4 + .../english_russian_proverbs.yaml | 4 + .../generate_until/entailed_polarity.yaml | 4 + .../entailed_polarity_hindi.yaml | 4 + .../generate_until/epistemic_reasoning.yaml | 4 + .../evaluating_information_essentiality.yaml | 4 + .../bigbench/generate_until/fact_checker.yaml | 4 + .../generate_until/fantasy_reasoning.yaml | 4 + .../bigbench/generate_until/few_shot_nlg.yaml | 4 + .../figure_of_speech_detection.yaml | 4 + .../formal_fallacies_syllogisms_negation.yaml | 4 + .../tasks/bigbench/generate_until/gem.yaml | 4 + .../gender_inclusive_sentences_german.yaml | 4 + .../generate_until/general_knowledge.yaml | 4 + .../generate_until/geometric_shapes.yaml | 4 + .../generate_until/goal_step_wikihow.yaml | 4 + .../gre_reading_comprehension.yaml | 4 + .../generate_until/hhh_alignment.yaml | 4 + .../hindi_question_answering.yaml | 4 + .../generate_until/hindu_knowledge.yaml | 4 + .../generate_until/hinglish_toxicity.yaml | 4 + .../generate_until/human_organs_senses.yaml | 4 + .../bigbench/generate_until/hyperbaton.yaml | 4 + .../identify_math_theorems.yaml | 4 + .../generate_until/identify_odd_metaphor.yaml | 4 + .../bigbench/generate_until/implicatures.yaml | 4 + .../generate_until/implicit_relations.yaml | 4 + .../generate_until/intent_recognition.yaml | 4 + .../international_phonetic_alphabet_nli.yaml | 4 + ...ional_phonetic_alphabet_transliterate.yaml | 4 + .../generate_until/intersect_geometry.yaml | 4 + .../generate_until/irony_identification.yaml | 4 + .../bigbench/generate_until/kanji_ascii.yaml | 4 + .../bigbench/generate_until/kannada.yaml | 4 + .../generate_until/key_value_maps.yaml | 4 + .../generate_until/known_unknowns.yaml | 4 + .../generate_until/language_games.yaml | 4 + .../language_identification.yaml | 4 + .../generate_until/linguistic_mappings.yaml | 4 + .../generate_until/linguistics_puzzles.yaml | 4 + .../generate_until/list_functions.yaml | 4 + .../generate_until/logic_grid_puzzle.yaml | 4 + .../bigbench/generate_until/logical_args.yaml | 4 + .../generate_until/logical_deduction.yaml | 4 + .../logical_fallacy_detection.yaml | 4 + .../generate_until/logical_sequence.yaml | 4 + .../mathematical_induction.yaml | 4 + .../bigbench/generate_until/matrixshapes.yaml | 4 + .../generate_until/metaphor_boolean.yaml | 4 + .../metaphor_understanding.yaml | 4 + .../generate_until/minute_mysteries_qa.yaml | 4 + .../generate_until/misconceptions.yaml | 4 + .../misconceptions_russian.yaml | 4 + 
.../bigbench/generate_until/mnist_ascii.yaml | 4 + .../generate_until/modified_arithmetic.yaml | 4 + .../generate_until/moral_permissibility.yaml | 4 + .../movie_dialog_same_or_different.yaml | 4 + .../generate_until/movie_recommendation.yaml | 4 + .../generate_until/mult_data_wrangling.yaml | 4 + .../bigbench/generate_until/multiemo.yaml | 4 + .../generate_until/natural_instructions.yaml | 4 + .../bigbench/generate_until/navigate.yaml | 4 + .../nonsense_words_grammar.yaml | 4 + .../generate_until/novel_concepts.yaml | 4 + .../generate_until/object_counting.yaml | 4 + .../bigbench/generate_until/odd_one_out.yaml | 4 + .../bigbench/generate_until/operators.yaml | 4 + .../paragraph_segmentation.yaml | 4 + .../bigbench/generate_until/parsinlu_qa.yaml | 4 + .../parsinlu_reading_comprehension.yaml | 4 + .../generate_until/penguins_in_a_table.yaml | 4 + .../generate_until/periodic_elements.yaml | 4 + .../generate_until/persian_idioms.yaml | 4 + .../generate_until/phrase_relatedness.yaml | 4 + .../generate_until/physical_intuition.yaml | 4 + .../bigbench/generate_until/physics.yaml | 4 + .../generate_until/physics_questions.yaml | 4 + .../play_dialog_same_or_different.yaml | 4 + .../polish_sequence_labeling.yaml | 4 + .../presuppositions_as_nli.yaml | 4 + .../bigbench/generate_until/qa_wikidata.yaml | 4 + .../generate_until/question_selection.yaml | 4 + .../generate_until/real_or_fake_text.yaml | 4 + .../reasoning_about_colored_objects.yaml | 4 + .../generate_until/repeat_copy_logic.yaml | 4 + .../bigbench/generate_until/rephrase.yaml | 4 + .../bigbench/generate_until/riddle_sense.yaml | 4 + .../bigbench/generate_until/ruin_names.yaml | 4 + .../salient_translation_error_detection.yaml | 4 + .../scientific_press_release.yaml | 4 + .../semantic_parsing_in_context_sparc.yaml | 4 + .../semantic_parsing_spider.yaml | 4 + .../generate_until/sentence_ambiguity.yaml | 4 + .../similarities_abstraction.yaml | 4 + .../generate_until/simp_turing_concept.yaml | 4 + .../simple_arithmetic_json.yaml | 4 + ...imple_arithmetic_json_multiple_choice.yaml | 4 + .../simple_arithmetic_json_subtasks.yaml | 4 + ...mple_arithmetic_multiple_targets_json.yaml | 4 + .../simple_ethical_questions.yaml | 4 + .../generate_until/simple_text_editing.yaml | 4 + .../tasks/bigbench/generate_until/snarks.yaml | 4 + .../bigbench/generate_until/social_iqa.yaml | 4 + .../generate_until/social_support.yaml | 4 + .../generate_until/sports_understanding.yaml | 4 + .../generate_until/strange_stories.yaml | 4 + .../bigbench/generate_until/strategyqa.yaml | 4 + .../sufficient_information.yaml | 4 + .../bigbench/generate_until/suicide_risk.yaml | 4 + .../swahili_english_proverbs.yaml | 4 + .../swedish_to_german_proverbs.yaml | 4 + .../generate_until/symbol_interpretation.yaml | 4 + .../generate_until/temporal_sequences.yaml | 4 + .../tasks/bigbench/generate_until/tense.yaml | 4 + .../bigbench/generate_until/timedial.yaml | 4 + .../bigbench/generate_until/topical_chat.yaml | 4 + .../tracking_shuffled_objects.yaml | 4 + .../generate_until/understanding_fables.yaml | 4 + .../generate_until/undo_permutation.yaml | 4 + .../generate_until/unit_conversion.yaml | 4 + .../generate_until/unit_interpretation.yaml | 4 + .../unnatural_in_context_learning.yaml | 4 + .../vitaminc_fact_verification.yaml | 4 + .../generate_until/what_is_the_tao.yaml | 4 + .../generate_until/which_wiki_edit.yaml | 4 + .../bigbench/generate_until/winowhy.yaml | 4 + .../bigbench/generate_until/word_sorting.yaml | 4 + .../generate_until/word_unscrambling.yaml | 4 + 
.../abstract_narrative_understanding.yaml | 4 - .../bigbench/greedy_until/anachronisms.yaml | 4 - .../greedy_until/analogical_similarity.yaml | 4 - .../greedy_until/analytic_entailment.yaml | 4 - .../bigbench/greedy_until/arithmetic.yaml | 4 - .../greedy_until/ascii_word_recognition.yaml | 4 - .../greedy_until/authorship_verification.yaml | 4 - .../greedy_until/auto_categorization.yaml | 4 - .../bigbench/greedy_until/auto_debugging.yaml | 4 - .../bigbench/greedy_until/bbq_lite_json.yaml | 4 - .../bridging_anaphora_resolution_barqa.yaml | 4 - .../greedy_until/causal_judgment.yaml | 4 - .../greedy_until/cause_and_effect.yaml | 4 - .../greedy_until/checkmate_in_one.yaml | 4 - .../greedy_until/chess_state_tracking.yaml | 4 - .../chinese_remainder_theorem.yaml | 4 - .../greedy_until/cifar10_classification.yaml | 4 - .../greedy_until/code_line_description.yaml | 4 - .../bigbench/greedy_until/codenames.yaml | 4 - .../tasks/bigbench/greedy_until/color.yaml | 4 - .../greedy_until/common_morpheme.yaml | 4 - .../greedy_until/conceptual_combinations.yaml | 4 - .../greedy_until/conlang_translation.yaml | 4 - ...extual_parametric_knowledge_conflicts.yaml | 4 - .../bigbench/greedy_until/crash_blossom.yaml | 4 - .../tasks/bigbench/greedy_until/crass_ai.yaml | 4 - .../greedy_until/cryobiology_spanish.yaml | 4 - .../bigbench/greedy_until/cryptonite.yaml | 4 - .../bigbench/greedy_until/cs_algorithms.yaml | 4 - .../greedy_until/dark_humor_detection.yaml | 4 - .../greedy_until/date_understanding.yaml | 4 - .../greedy_until/disambiguation_qa.yaml | 4 - .../discourse_marker_prediction.yaml | 4 - .../tasks/bigbench/greedy_until/disfl_qa.yaml | 4 - .../bigbench/greedy_until/dyck_languages.yaml | 4 - .../greedy_until/elementary_math_qa.yaml | 4 - .../bigbench/greedy_until/emoji_movie.yaml | 4 - .../emojis_emotion_prediction.yaml | 4 - .../greedy_until/empirical_judgments.yaml | 4 - .../greedy_until/english_proverbs.yaml | 4 - .../english_russian_proverbs.yaml | 4 - .../greedy_until/entailed_polarity.yaml | 4 - .../greedy_until/entailed_polarity_hindi.yaml | 4 - .../greedy_until/epistemic_reasoning.yaml | 4 - .../evaluating_information_essentiality.yaml | 4 - .../bigbench/greedy_until/fact_checker.yaml | 4 - .../greedy_until/fantasy_reasoning.yaml | 4 - .../bigbench/greedy_until/few_shot_nlg.yaml | 4 - .../figure_of_speech_detection.yaml | 4 - .../formal_fallacies_syllogisms_negation.yaml | 4 - lm_eval/tasks/bigbench/greedy_until/gem.yaml | 4 - .../gender_inclusive_sentences_german.yaml | 4 - .../greedy_until/general_knowledge.yaml | 4 - .../greedy_until/geometric_shapes.yaml | 4 - .../greedy_until/goal_step_wikihow.yaml | 4 - .../gre_reading_comprehension.yaml | 4 - .../bigbench/greedy_until/hhh_alignment.yaml | 4 - .../hindi_question_answering.yaml | 4 - .../greedy_until/hindu_knowledge.yaml | 4 - .../greedy_until/hinglish_toxicity.yaml | 4 - .../greedy_until/human_organs_senses.yaml | 4 - .../bigbench/greedy_until/hyperbaton.yaml | 4 - .../greedy_until/identify_math_theorems.yaml | 4 - .../greedy_until/identify_odd_metaphor.yaml | 4 - .../bigbench/greedy_until/implicatures.yaml | 4 - .../greedy_until/implicit_relations.yaml | 4 - .../greedy_until/intent_recognition.yaml | 4 - .../international_phonetic_alphabet_nli.yaml | 4 - ...ional_phonetic_alphabet_transliterate.yaml | 4 - .../greedy_until/intersect_geometry.yaml | 4 - .../greedy_until/irony_identification.yaml | 4 - .../bigbench/greedy_until/kanji_ascii.yaml | 4 - .../tasks/bigbench/greedy_until/kannada.yaml | 4 - 
.../bigbench/greedy_until/key_value_maps.yaml | 4 - .../bigbench/greedy_until/known_unknowns.yaml | 4 - .../bigbench/greedy_until/language_games.yaml | 4 - .../greedy_until/language_identification.yaml | 4 - .../greedy_until/linguistic_mappings.yaml | 4 - .../greedy_until/linguistics_puzzles.yaml | 4 - .../bigbench/greedy_until/list_functions.yaml | 4 - .../greedy_until/logic_grid_puzzle.yaml | 4 - .../bigbench/greedy_until/logical_args.yaml | 4 - .../greedy_until/logical_deduction.yaml | 4 - .../logical_fallacy_detection.yaml | 4 - .../greedy_until/logical_sequence.yaml | 4 - .../greedy_until/mathematical_induction.yaml | 4 - .../bigbench/greedy_until/matrixshapes.yaml | 4 - .../greedy_until/metaphor_boolean.yaml | 4 - .../greedy_until/metaphor_understanding.yaml | 4 - .../greedy_until/minute_mysteries_qa.yaml | 4 - .../bigbench/greedy_until/misconceptions.yaml | 4 - .../greedy_until/misconceptions_russian.yaml | 4 - .../bigbench/greedy_until/mnist_ascii.yaml | 4 - .../greedy_until/modified_arithmetic.yaml | 4 - .../greedy_until/moral_permissibility.yaml | 4 - .../movie_dialog_same_or_different.yaml | 4 - .../greedy_until/movie_recommendation.yaml | 4 - .../greedy_until/mult_data_wrangling.yaml | 4 - .../tasks/bigbench/greedy_until/multiemo.yaml | 4 - .../greedy_until/natural_instructions.yaml | 4 - .../tasks/bigbench/greedy_until/navigate.yaml | 4 - .../greedy_until/nonsense_words_grammar.yaml | 4 - .../bigbench/greedy_until/novel_concepts.yaml | 4 - .../greedy_until/object_counting.yaml | 4 - .../bigbench/greedy_until/odd_one_out.yaml | 4 - .../bigbench/greedy_until/operators.yaml | 4 - .../greedy_until/paragraph_segmentation.yaml | 4 - .../bigbench/greedy_until/parsinlu_qa.yaml | 4 - .../parsinlu_reading_comprehension.yaml | 4 - .../greedy_until/penguins_in_a_table.yaml | 4 - .../greedy_until/periodic_elements.yaml | 4 - .../bigbench/greedy_until/persian_idioms.yaml | 4 - .../greedy_until/phrase_relatedness.yaml | 4 - .../greedy_until/physical_intuition.yaml | 4 - .../tasks/bigbench/greedy_until/physics.yaml | 4 - .../greedy_until/physics_questions.yaml | 4 - .../play_dialog_same_or_different.yaml | 4 - .../polish_sequence_labeling.yaml | 4 - .../greedy_until/presuppositions_as_nli.yaml | 4 - .../bigbench/greedy_until/qa_wikidata.yaml | 4 - .../greedy_until/question_selection.yaml | 4 - .../greedy_until/real_or_fake_text.yaml | 4 - .../reasoning_about_colored_objects.yaml | 4 - .../greedy_until/repeat_copy_logic.yaml | 4 - .../tasks/bigbench/greedy_until/rephrase.yaml | 4 - .../bigbench/greedy_until/riddle_sense.yaml | 4 - .../bigbench/greedy_until/ruin_names.yaml | 4 - .../salient_translation_error_detection.yaml | 4 - .../scientific_press_release.yaml | 4 - .../semantic_parsing_in_context_sparc.yaml | 4 - .../greedy_until/semantic_parsing_spider.yaml | 4 - .../greedy_until/sentence_ambiguity.yaml | 4 - .../similarities_abstraction.yaml | 4 - .../greedy_until/simp_turing_concept.yaml | 4 - .../greedy_until/simple_arithmetic_json.yaml | 4 - ...imple_arithmetic_json_multiple_choice.yaml | 4 - .../simple_arithmetic_json_subtasks.yaml | 4 - ...mple_arithmetic_multiple_targets_json.yaml | 4 - .../simple_ethical_questions.yaml | 4 - .../greedy_until/simple_text_editing.yaml | 4 - .../tasks/bigbench/greedy_until/snarks.yaml | 4 - .../bigbench/greedy_until/social_iqa.yaml | 4 - .../bigbench/greedy_until/social_support.yaml | 4 - .../greedy_until/sports_understanding.yaml | 4 - .../greedy_until/strange_stories.yaml | 4 - .../bigbench/greedy_until/strategyqa.yaml | 4 - 
.../greedy_until/sufficient_information.yaml | 4 - .../bigbench/greedy_until/suicide_risk.yaml | 4 - .../swahili_english_proverbs.yaml | 4 - .../swedish_to_german_proverbs.yaml | 4 - .../greedy_until/symbol_interpretation.yaml | 4 - .../greedy_until/temporal_sequences.yaml | 4 - .../tasks/bigbench/greedy_until/tense.yaml | 4 - .../tasks/bigbench/greedy_until/timedial.yaml | 4 - .../bigbench/greedy_until/topical_chat.yaml | 4 - .../tracking_shuffled_objects.yaml | 4 - .../greedy_until/understanding_fables.yaml | 4 - .../greedy_until/undo_permutation.yaml | 4 - .../greedy_until/unit_conversion.yaml | 4 - .../greedy_until/unit_interpretation.yaml | 4 - .../unnatural_in_context_learning.yaml | 4 - .../vitaminc_fact_verification.yaml | 4 - .../greedy_until/what_is_the_tao.yaml | 4 - .../greedy_until/which_wiki_edit.yaml | 4 - .../tasks/bigbench/greedy_until/winowhy.yaml | 4 - .../bigbench/greedy_until/word_sorting.yaml | 4 - .../greedy_until/word_unscrambling.yaml | 4 - .../tasks/bigbench/greedy_until_template_yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/go.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/java.yaml | 2 +- .../code_x_glue/code-text/javascript.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/php.yaml | 2 +- .../tasks/code_x_glue/code-text/python.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/ruby.yaml | 2 +- lm_eval/tasks/coqa/default.yaml | 2 +- lm_eval/tasks/drop/default.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k-cot.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k.yaml | 2 +- lm_eval/tasks/logiqa2/logieval.yaml | 2 +- lm_eval/tasks/mgsm/direct/direct_yaml | 2 +- lm_eval/tasks/mgsm/en_cot/cot_yaml | 2 +- lm_eval/tasks/mgsm/native_cot/cot_yaml | 2 +- lm_eval/tasks/minerva_math/README.md | 2 +- .../minerva_math/minerva_math_algebra.yaml | 2 +- .../_mmlu_flan_cot_fewshot_template_yaml | 2 +- .../_mmlu_flan_generative_template_yaml | 2 +- .../_mmlu_flan_generative_template_yaml | 2 +- lm_eval/tasks/nq_open/nq_open.yaml | 2 +- lm_eval/tasks/polemo2/polemo2_in.yaml | 2 +- lm_eval/tasks/qasper/freeform.yaml | 2 +- lm_eval/tasks/squadv2/default.yaml | 16 +--- lm_eval/tasks/super_glue/boolq/seq2seq.yaml | 2 +- lm_eval/tasks/super_glue/boolq/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/cb/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/copa/t5-prompt.yaml | 2 +- .../tasks/super_glue/multirc/t5-prompt.yaml | 2 +- .../tasks/super_glue/record/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/rte/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wic/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wsc/t5-prompt.yaml | 2 +- .../tasks/translation/iwslt2017_ar-en.yaml | 2 +- .../tasks/translation/iwslt2017_en-ar.yaml | 2 +- lm_eval/tasks/translation/utils.py | 2 +- lm_eval/tasks/translation/wmt14_en-fr.yaml | 2 +- lm_eval/tasks/translation/wmt14_fr-en.yaml | 2 +- lm_eval/tasks/translation/wmt16_de-en.yaml | 2 +- lm_eval/tasks/translation/wmt16_en-de.yaml | 2 +- lm_eval/tasks/translation/wmt16_en-ro.yaml | 2 +- lm_eval/tasks/translation/wmt16_ro-en.yaml | 2 +- lm_eval/tasks/translation/wmt_common_yaml | 2 +- lm_eval/tasks/triviaqa/default.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml | 2 +- lm_eval/tasks/unscramble/anagrams1.yaml | 2 +- lm_eval/tasks/unscramble/anagrams2.yaml | 2 +- lm_eval/tasks/unscramble/cycle_letters.yaml | 2 +- .../tasks/unscramble/random_insertion.yaml | 2 +- lm_eval/tasks/unscramble/reversed_words.yaml | 2 +- lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml | 2 +- 403 files changed, 766 insertions(+), 854 deletions(-) delete mode 100644 lm_eval/benchmarks/__init__.py rename lm_eval/{ => 
tasks}/benchmarks/minerva_math.yaml (100%) create mode 100644 lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/anachronisms.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/arithmetic.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/code_line_description.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/codenames.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/color.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/crass_ai.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/cryptonite.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/date_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml create mode 
100644 lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/fact_checker.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/gem.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/implicatures.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/irony_identification.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/kannada.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/language_games.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/language_identification.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/list_functions.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/logical_args.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml create mode 100644 
lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/misconceptions.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/multiemo.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/navigate.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/object_counting.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/operators.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/physics.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/physics_questions.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/question_selection.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/rephrase.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/ruin_names.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml create 
mode 100644 lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/snarks.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/social_iqa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/social_support.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/strange_stories.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/strategyqa.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/tense.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/timedial.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/topical_chat.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/winowhy.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/word_sorting.yaml create mode 100644 lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml delete mode 100644 
lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/codenames.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/color.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml delete mode 100644 lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml delete mode 100644 
lm_eval/tasks/bigbench/greedy_until/gem.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/implicatures.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/kannada.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/language_games.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/language_identification.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/list_functions.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_args.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/navigate.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/operators.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/physics.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/question_selection.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/rephrase.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/snarks.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/social_support.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/tense.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/timedial.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/winowhy.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
 delete mode 100644 lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml

diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py
index d0f7d14b..32813dec 100644
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -44,7 +44,7 @@ ALL_OUTPUT_TYPES = [
     "loglikelihood",
     "multiple_choice",
     "loglikelihood_rolling",
-    "greedy_until",
+    "generate_until",
 ]
@@ -80,7 +80,7 @@ class TaskConfig(dict):
     num_fewshot: int = 0
     # scoring options
     metric_list: list = None
-    output_type: str = "greedy_until"
+    output_type: str = "generate_until"
     generation_kwargs: dict = None
     repeats: int = 1
     filter_list: Union[str, list] = None
@@ -97,11 +97,11 @@
             self.dataset_path = inspect.getfile(import_module(self.dataset_path))
         if self.generation_kwargs is not None:
-            if self.output_type != "greedy_until":
+            if self.output_type != "generate_until":
                 eval_logger.warning(
-                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: greedy_until`!"
+                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
                 )
-                assert self.output_type != "greedy_until"
+                assert self.output_type != "generate_until"
             if "temperature" in self.generation_kwargs:
                 self.generation_kwargs["temperature"] = float(
@@ -111,7 +111,7 @@
             if "until" not in self.generation_kwargs:
                 self.generation_kwargs["until"] = [self.fewshot_delimiter]
         else:
-            if self.output_type == "greedy_until":
+            if self.output_type == "generate_until":
                 # ensure that we greedily generate in absence of explicit arguments otherwise
                 self.generation_kwargs = {
                     "until": None
@@ -958,7 +958,7 @@ class ConfigurableTask(Task):
             )
             return request_list
-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
             arguments = (ctx, self.config.generation_kwargs)
         return Instance(
@@ -1070,7 +1070,7 @@ class ConfigurableTask(Task):
             acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
             result_dict["acc_mutual_info"] = acc_mutual_info
-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
             gold = self.doc_to_target(doc)
             result = results[0]
             if self.config.doc_to_choice is not None:
@@ -1134,7 +1134,7 @@
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
+                "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
             )
         return result_dict
diff --git a/lm_eval/benchmarks/__init__.py b/lm_eval/benchmarks/__init__.py
deleted file mode 100644
index e87ad788..00000000
--- a/lm_eval/benchmarks/__init__.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import os
-import yaml
-
-from lm_eval import utils
-from lm_eval.tasks import register_configurable_task, check_prompt_config
-from lm_eval.logger import eval_logger
-from lm_eval.api.registry import (
-    TASK_REGISTRY,
-    GROUP_REGISTRY,
-    ALL_TASKS,
-)
-
-
-def include_benchmarks(task_dir: str) -> None:
-    for root, subdirs, file_list in os.walk(task_dir):
-        if (subdirs == [] or "__pycache__" in subdirs) and (len(file_list) > 0):
-            for f in file_list:
-                if f.endswith(".yaml"):
-                    try:
-                        benchmark_path = os.path.join(root, f)
-
-                        with open(benchmark_path, "rb") as file:
-                            yaml_config = yaml.full_load(file)
-
-                        if "prompts" in yaml_config:
-                            continue  # Skip it
-
-                        assert "group" in yaml_config
-                        group = yaml_config["group"]
-                        all_task_list = yaml_config["task"]
-                        config_list = [
-                            task for task in all_task_list if type(task) != str
-                        ]
-                        task_list = [
-                            task for task in all_task_list if type(task) == str
-                        ]
-
-                        for task_config in config_list:
-                            yaml_dir = os.path.dirname(benchmark_path)
-                            task_config = utils.load_yaml_config(
-                                yaml_config=task_config, yaml_dir=yaml_dir
-                            )
-                            if "use_prompt" in task_config:
-                                if "yaml" in task_config["use_prompt"]:
-                                    task_config["use_prompt"] = os.path.join(
-                                        root, task_config["use_prompt"]
-                                    )
-
-                            var_configs = check_prompt_config(
-                                {
-                                    **task_config,
-                                    **{"group": group},
-                                }
-                            )
-                            for config in var_configs:
-                                register_configurable_task(config)
-
-                        task_names = utils.pattern_match(task_list, ALL_TASKS)
-                        for task in task_names:
-                            if task in TASK_REGISTRY:
-                                if group in GROUP_REGISTRY:
-                                    GROUP_REGISTRY[group].append(task)
-                                else:
-                                    GROUP_REGISTRY[group] = [task]
-                                ALL_TASKS.add(group)
-                    except Exception as error:
-                        eval_logger.warning(
-                            "Failed to load benchmark in\n"
-                            f"  {benchmark_path}\n"
-                            "  Benchmark will not be added to registry\n"
-                            f"  Error: {error}"
-                        )
-
-
-task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
-include_benchmarks(task_dir)
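The hunks above complete the core rename: `greedy_until` becomes `generate_until` both as a member of ALL_OUTPUT_TYPES and as the default `output_type` of `TaskConfig`, and the separate `lm_eval/benchmarks` registry module is removed in favor of the `lm_eval/tasks` machinery. The `generation_kwargs` validation shown in the `@@ -97` and `@@ -111` hunks can be summarized as follows; this is a simplified sketch of the behaviour visible in the diff, not the harness's exact code, and the "\n\n" default for `fewshot_delimiter` is an assumption:

    # Sketch: how generation arguments resolve for a task config after this patch.
    def resolve_generation_kwargs(generation_kwargs, output_type, fewshot_delimiter="\n\n"):
        if generation_kwargs is not None:
            # explicit kwargs are only meaningful for generative tasks
            assert output_type == "generate_until"
            if "temperature" in generation_kwargs:
                generation_kwargs["temperature"] = float(generation_kwargs["temperature"])
            if "until" not in generation_kwargs:
                # stop at the few-shot delimiter unless told otherwise
                generation_kwargs["until"] = [fewshot_delimiter]
            return generation_kwargs
        if output_type == "generate_until":
            # greedily generate in the absence of explicit arguments
            return {"until": None}
        return None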
diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py
index 953ea913..be144b16 100644
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -138,7 +138,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
     def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")
-    def greedy_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> List[str]:
         if not requests:
             return []
@@ -164,7 +164,7 @@
                 )
                 res.append(response)
-                self.cache_hook.add_partial("greedy_until", request, response)
+                self.cache_hook.add_partial("generate_until", request, response)
             except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
                 eval_logger.critical(f"Server unreachable: {e.__cause__}")
                 break
@@ -179,7 +179,7 @@
         raise NotImplementedError()
     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()
     def loglikelihood(self, requests):
diff --git a/lm_eval/models/dummy.py b/lm_eval/models/dummy.py
index 0264e763..b13a3900 100644
--- a/lm_eval/models/dummy.py
+++ b/lm_eval/models/dummy.py
@@ -20,7 +20,7 @@ class DummyLM(LM):
         return res
-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         res = []
         for ctx, _ in requests:
diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index d4b4c9b6..0a2519e6 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -813,7 +813,7 @@ class HFLM(LM):
         return re_ord.get_original(res)
-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         res = defaultdict(list)
         re_ords = {}
@@ -930,7 +930,7 @@
                 res[key].append(s)
                 self.cache_hook.add_partial(
-                    "greedy_until", (context, gen_kwargs), s
+                    "generate_until", (context, gen_kwargs), s
                 )
                 pbar.update(1)
             # reorder this group of results back to original unsorted form
diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py
index eb05dd4c..1a06d85a 100644
--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -203,7 +203,7 @@ class OpenaiCompletionsLM(LM):
             self.cache_hook.add_partial("loglikelihood", cache_key, answer)
         return re_ord.get_original(res)
-    def greedy_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> List[str]:
         if not requests:
             return []
         res = []
@@ -260,7 +260,7 @@
                 # partial caching
                 self.cache_hook.add_partial(
-                    "greedy_until", (context, {"until": until_}), s
+                    "generate_until", (context, {"until": until_}), s
                 )
                 res.append(s)
@@ -271,7 +271,7 @@
         raise NotImplementedError()
     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()
     def loglikelihood_rolling(self, requests) -> List[float]:
diff --git a/lm_eval/models/textsynth.py b/lm_eval/models/textsynth.py
index a8fcfb9c..379f11b9 100644
--- a/lm_eval/models/textsynth.py
+++ b/lm_eval/models/textsynth.py
@@ -58,7 +58,7 @@ class TextSynthLM(LM):
     @property
     def eot_token_id(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
     @property
@@ -72,20 +72,20 @@
     @property
     def batch_size(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
     @property
     def device(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
     def tok_encode(self, string: str):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
     def tok_decode(self, tokens):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
     def loglikelihood(self, requests):
@@ -122,7 +122,7 @@
             "input tokenization support from TextSynth."
         )
-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         if not requests:
             return []
@@ -146,7 +146,7 @@
                 s = resp["text"]
                 res.append(s)
-                self.cache_hook.add_partial("greedy_until", (inp, request_args), s)
+                self.cache_hook.add_partial("generate_until", (inp, request_args), s)
             else:
                 logger.error(
                     f"The following response does not contain generated `text`. "
@@ -160,5 +160,5 @@
         raise NotImplementedError()
     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 0b124a67..026f52e4 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -98,7 +98,7 @@ def check_prompt_config(
                         ]
                     )
                 },
-                **{"output_type": "greedy_until"},
+                **{"output_type": "generate_until"},
             }
         )
     else:
diff --git a/lm_eval/tasks/babi/babi.yaml b/lm_eval/tasks/babi/babi.yaml
index 1b10cc00..5181b2a1 100644
--- a/lm_eval/tasks/babi/babi.yaml
+++ b/lm_eval/tasks/babi/babi.yaml
@@ -1,7 +1,7 @@
 task: babi
 dataset_path: Muennighoff/babi
 dataset_name: null
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: valid
 test_split: test
diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml
index 2e2e8bc9..b96dd712 100644
--- a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml
@@ -1,6 +1,6 @@
 group: bbh_flan_cot_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml
index 7ccf3699..8bb0f2b7 100644
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml
@@ -1,6 +1,6 @@
 group: bbh_flan_cot_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
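Every LM backend renames its generation entry point the same way, and, just as importantly, the string key passed to `cache_hook.add_partial` changes from "greedy_until" to "generate_until", so cached responses written under the old key will not be reused after upgrading. A hypothetical minimal backend following the renamed contract looks like the sketch below (`EchoLM` and its constructor are illustrative, not part of the codebase):

    from typing import List

    class EchoLM:
        """Toy backend showing the renamed generate_until contract."""

        def __init__(self, cache_hook=None):
            self.cache_hook = cache_hook

        def generate_until(self, requests) -> List[str]:
            res = []
            for context, gen_kwargs in requests:
                # a real backend would generate here, honoring gen_kwargs["until"]
                s = ""
                if self.cache_hook is not None:
                    self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                res.append(s)
            return res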
diff --git a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
index 89e5de29..93503989 100644
--- a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
@@ -1,6 +1,6 @@
 group: bbh_flan_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
diff --git a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml
index 66dbf369..cd5bafe8 100644
--- a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml
@@ -1,6 +1,6 @@
 group: bbh_flan_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
diff --git a/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml b/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
index cbd40849..6e460a0e 100644
--- a/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
+++ b/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
@@ -1,5 +1,5 @@
 group: flan-cot
-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 doc_to_target: "{{answer}}"
 metric_list:
diff --git a/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml b/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml
index e09daca2..f5050a49 100644
--- a/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml
+++ b/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml
@@ -1,4 +1,4 @@
-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 metric_list:
   - metric: exact_match
diff --git a/lm_eval/benchmarks/minerva_math.yaml b/lm_eval/tasks/benchmarks/minerva_math.yaml
similarity index 100%
rename from lm_eval/benchmarks/minerva_math.yaml
rename to lm_eval/tasks/benchmarks/minerva_math.yaml
diff --git a/lm_eval/tasks/benchmarks/t0_eval.yaml b/lm_eval/tasks/benchmarks/t0_eval.yaml
index 788122e9..27e7adc4 100644
--- a/lm_eval/tasks/benchmarks/t0_eval.yaml
+++ b/lm_eval/tasks/benchmarks/t0_eval.yaml
@@ -6,7 +6,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -19,7 +19,7 @@ task:
     use_prompt: promptsource:*
    training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -32,7 +32,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -44,7 +44,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -56,7 +56,7 @@ task:
     use_prompt: promptsource:*
     training_split: train_r1
     validation_split: dev_r1
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -68,7 +68,7 @@ task:
     use_prompt: promptsource:*
     training_split: train_r2
     validation_split: dev_r2
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -80,7 +80,7 @@ task:
     use_prompt: promptsource:*
     training_split: train_r3
     validation_split: dev_r3
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -93,7 +93,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -105,7 +105,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -118,7 +118,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py
index 00a8799e..fa8619f4 100644
--- a/lm_eval/tasks/bigbench/generate_tasks.py
+++ b/lm_eval/tasks/bigbench/generate_tasks.py
@@ -175,8 +175,8 @@ all_subtasks = [
 def main() -> None:
     for path, task_type in zip(
-        ["multiple_choice", "greedy_until"],
-        ["multiple_choice_template_yaml", "greedy_until_template_yaml"],
+        ["multiple_choice", "generate_until"],
+        ["multiple_choice_template_yaml", "generate_until_template_yaml"],
     ):
         os.makedirs(path, exist_ok=True)
         for task in all_subtasks:
diff --git a/lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml
new file mode 100644
index 00000000..dce5238b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: abstract_narrative_understanding_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_abstract_narrative_understanding_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/anachronisms.yaml b/lm_eval/tasks/bigbench/generate_until/anachronisms.yaml
new file mode 100644
index 00000000..83136198
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/anachronisms.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: anachronisms_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_anachronisms_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml b/lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml
new file mode 100644
index 00000000..5cc6550a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: analogical_similarity_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_analogical_similarity_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml b/lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml
new file mode 100644
index 00000000..4ae5cfe9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: analytic_entailment_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_analytic_entailment_generate_until
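The regenerated BIG-bench task files that follow are all produced by the loop in generate_tasks.py shown above: one four-line YAML per subtask, pointing at the shared `generate_until_template_yaml` and the subtask's `*_zero_shot` dataset config. A simplified sketch of that generation step (the file-writing details are assumed here, not shown in the hunk):

    import os

    def write_generate_until_yamls(all_subtasks, path="generate_until"):
        os.makedirs(path, exist_ok=True)
        for task in all_subtasks:
            # each generated file matches the four-line pattern below
            with open(os.path.join(path, f"{task}.yaml"), "w") as f:
                f.write("# Generated by utils.py\n")
                f.write(f"dataset_name: {task}_zero_shot\n")
                f.write("include: ../generate_until_template_yaml\n")
                f.write(f"task: bigbench_{task}_generate_until\n")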
diff --git a/lm_eval/tasks/bigbench/generate_until/arithmetic.yaml b/lm_eval/tasks/bigbench/generate_until/arithmetic.yaml
new file mode 100644
index 00000000..d6ae791f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/arithmetic.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: arithmetic_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_arithmetic_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml
new file mode 100644
index 00000000..60eaa0be
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: ascii_word_recognition_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_ascii_word_recognition_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml b/lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml
new file mode 100644
index 00000000..3d7510df
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: authorship_verification_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_authorship_verification_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml b/lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml
new file mode 100644
index 00000000..d90a0e7c
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: auto_categorization_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_auto_categorization_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml b/lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml
new file mode 100644
index 00000000..d8802c1c
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: auto_debugging_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_auto_debugging_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml
new file mode 100644
index 00000000..6812f699
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: bbq_lite_json_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_bbq_lite_json_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml
new file mode 100644
index 00000000..28e7309f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: bridging_anaphora_resolution_barqa_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_bridging_anaphora_resolution_barqa_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml b/lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml
new file mode 100644
index 00000000..1e165680
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: causal_judgment_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_causal_judgment_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml b/lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml
new file mode 100644
index 00000000..c34bfdc2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cause_and_effect_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_cause_and_effect_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml
new file mode 100644
index 00000000..e0736f96
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: checkmate_in_one_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_checkmate_in_one_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml
new file mode 100644
index 00000000..8b3dde85
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: chess_state_tracking_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_chess_state_tracking_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml
new file mode 100644
index 00000000..872e809b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: chinese_remainder_theorem_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_chinese_remainder_theorem_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml b/lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml
new file mode 100644
index 00000000..1a3b08ca
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cifar10_classification_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_cifar10_classification_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/code_line_description.yaml b/lm_eval/tasks/bigbench/generate_until/code_line_description.yaml
new file mode 100644
index 00000000..4bd83353
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/code_line_description.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: code_line_description_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_code_line_description_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/codenames.yaml b/lm_eval/tasks/bigbench/generate_until/codenames.yaml
new file mode 100644
index 00000000..e71510b4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/codenames.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: codenames_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_codenames_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/color.yaml b/lm_eval/tasks/bigbench/generate_until/color.yaml
new file mode 100644
index 00000000..18793a99
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/color.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: color_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_color_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml b/lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml
new file mode 100644
index 00000000..09a8b9f4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: common_morpheme_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_common_morpheme_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml
new file mode 100644
index 00000000..b36c1d5c
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: conceptual_combinations_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_conceptual_combinations_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml b/lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml
new file mode 100644
index 00000000..ec9cccc8
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: conlang_translation_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_conlang_translation_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml
new file mode 100644
index 00000000..e4da8946
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: contextual_parametric_knowledge_conflicts_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_contextual_parametric_knowledge_conflicts_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml b/lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml
new file mode 100644
index 00000000..3b551e5d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: crash_blossom_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_crash_blossom_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/crass_ai.yaml b/lm_eval/tasks/bigbench/generate_until/crass_ai.yaml
new file mode 100644
index 00000000..a65d1c33
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/crass_ai.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: crass_ai_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_crass_ai_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml
new file mode 100644
index 00000000..5fc59ee2
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cryobiology_spanish_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_cryobiology_spanish_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/cryptonite.yaml b/lm_eval/tasks/bigbench/generate_until/cryptonite.yaml
new file mode 100644
index 00000000..3393c368
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/cryptonite.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cryptonite_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_cryptonite_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml b/lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml
new file mode 100644
index 00000000..938fc4af
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: cs_algorithms_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_cs_algorithms_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml
new file mode 100644
index 00000000..f13ec2a4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: dark_humor_detection_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_dark_humor_detection_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/date_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/date_understanding.yaml
new file mode 100644
index 00000000..0fdca6ab
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/date_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: date_understanding_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_date_understanding_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml
new file mode 100644
index 00000000..b671d715
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: disambiguation_qa_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_disambiguation_qa_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml
new file mode 100644
index 00000000..30182d9d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: discourse_marker_prediction_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_discourse_marker_prediction_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml b/lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml
new file mode 100644
index 00000000..4c6b9567
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: disfl_qa_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_disfl_qa_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml b/lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml
new file mode 100644
index 00000000..814a95de
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: dyck_languages_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_dyck_languages_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml
new file mode 100644
index 00000000..9fe807bc
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: elementary_math_qa_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_elementary_math_qa_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml b/lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml
new file mode 100644
index 00000000..af958389
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: emoji_movie_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_emoji_movie_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml b/lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml
new file mode 100644
index 00000000..3eafb819
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: emojis_emotion_prediction_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_emojis_emotion_prediction_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml b/lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml
new file mode 100644
index 00000000..1b26cbee
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: empirical_judgments_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_empirical_judgments_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml
new file mode 100644
index 00000000..cdd014d9
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: english_proverbs_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_english_proverbs_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml
new file mode 100644
index 00000000..4e6da1e0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: english_russian_proverbs_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_english_russian_proverbs_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml b/lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml
new file mode 100644
index 00000000..cb2ecba0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: entailed_polarity_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_entailed_polarity_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml
new file mode 100644
index 00000000..aba850d3
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: entailed_polarity_hindi_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_entailed_polarity_hindi_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml
new file mode 100644
index 00000000..f080bcf3
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: epistemic_reasoning_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_epistemic_reasoning_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml
new file mode 100644
index 00000000..b640b943
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: evaluating_information_essentiality_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_evaluating_information_essentiality_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/fact_checker.yaml b/lm_eval/tasks/bigbench/generate_until/fact_checker.yaml
new file mode 100644
index 00000000..62dd5197
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/fact_checker.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: fact_checker_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_fact_checker_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml b/lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml
new file mode 100644
index 00000000..b94f4c05
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: fantasy_reasoning_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_fantasy_reasoning_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml
new file mode 100644
index 00000000..718837f1
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: few_shot_nlg_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_few_shot_nlg_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml
new file mode 100644
index 00000000..ffbb5f60
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: figure_of_speech_detection_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_figure_of_speech_detection_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml
new file mode 100644
index 00000000..d3afc0ed
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: formal_fallacies_syllogisms_negation_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_formal_fallacies_syllogisms_negation_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/gem.yaml b/lm_eval/tasks/bigbench/generate_until/gem.yaml
new file mode 100644
index 00000000..f59f2878
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/gem.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: gem_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_gem_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml
new file mode 100644
index 00000000..12dd01b8
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: gender_inclusive_sentences_german_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_gender_inclusive_sentences_german_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml b/lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml
new file mode 100644
index 00000000..1c0a2ea6
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: general_knowledge_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_general_knowledge_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml b/lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml
new file mode 100644
index 00000000..d586c3cb
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: geometric_shapes_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_geometric_shapes_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml
new file mode 100644
index 00000000..22748246
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: goal_step_wikihow_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_goal_step_wikihow_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml
new file mode 100644
index 00000000..449b09c4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: gre_reading_comprehension_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_gre_reading_comprehension_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml b/lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml
new file mode 100644
index 00000000..c5c437a4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hhh_alignment_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_hhh_alignment_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml
new file mode 100644
index 00000000..463450b0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hindi_question_answering_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_hindi_question_answering_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml
new file mode 100644
index 00000000..7fef48a4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hindu_knowledge_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_hindu_knowledge_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml
new file mode 100644
index 00000000..7ad63dda
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hinglish_toxicity_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_hinglish_toxicity_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml b/lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml
new file mode 100644
index 00000000..2334fd6d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: human_organs_senses_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_human_organs_senses_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml b/lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml
new file mode 100644
index 00000000..1e428c2a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: hyperbaton_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_hyperbaton_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml
new file mode 100644
index 00000000..4d0028e0
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: identify_math_theorems_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_identify_math_theorems_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml
new file mode 100644
index 00000000..b4e1f9aa
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: identify_odd_metaphor_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_identify_odd_metaphor_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/implicatures.yaml b/lm_eval/tasks/bigbench/generate_until/implicatures.yaml
new file mode 100644
index 00000000..cf19c32a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/implicatures.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: implicatures_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_implicatures_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml b/lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml
new file mode 100644
index 00000000..361f0435
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: implicit_relations_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_implicit_relations_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml b/lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml
new file mode 100644
index 00000000..0583a17e
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: intent_recognition_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_intent_recognition_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml
new file mode 100644
index 00000000..1497c780
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: international_phonetic_alphabet_nli_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_international_phonetic_alphabet_nli_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml
new file mode 100644
index 00000000..71ad3b9d
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: international_phonetic_alphabet_transliterate_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_international_phonetic_alphabet_transliterate_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml b/lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml
new file mode 100644
index 00000000..0f2868a4
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: intersect_geometry_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_intersect_geometry_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/irony_identification.yaml b/lm_eval/tasks/bigbench/generate_until/irony_identification.yaml
new file mode 100644
index 00000000..556c5a62
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/irony_identification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: irony_identification_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_irony_identification_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml b/lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml
new file mode 100644
index 00000000..f9a8a5b8
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: kanji_ascii_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_kanji_ascii_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/kannada.yaml b/lm_eval/tasks/bigbench/generate_until/kannada.yaml
new file mode 100644
index 00000000..047e7049
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/kannada.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: kannada_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_kannada_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml b/lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml
new file mode 100644
index 00000000..3ea697d1
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: key_value_maps_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_key_value_maps_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml b/lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml
new file mode 100644
index 00000000..b1a8bb06
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: known_unknowns_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_known_unknowns_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/language_games.yaml b/lm_eval/tasks/bigbench/generate_until/language_games.yaml
new file mode 100644
index 00000000..56022300
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/language_games.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: language_games_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_language_games_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/language_identification.yaml b/lm_eval/tasks/bigbench/generate_until/language_identification.yaml
new file mode 100644
index 00000000..9cb7b274
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/language_identification.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: language_identification_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_language_identification_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml
new file mode 100644
index 00000000..cc351ce1
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: linguistic_mappings_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_linguistic_mappings_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml
new file mode 100644
index 00000000..df8b729a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: linguistics_puzzles_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_linguistics_puzzles_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/list_functions.yaml b/lm_eval/tasks/bigbench/generate_until/list_functions.yaml
new file mode 100644
index 00000000..658630ac
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/list_functions.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: list_functions_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_list_functions_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml
new file mode 100644
index 00000000..aa8f2c2f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logic_grid_puzzle_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_logic_grid_puzzle_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/logical_args.yaml b/lm_eval/tasks/bigbench/generate_until/logical_args.yaml
new file mode 100644
index 00000000..e85c1429
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/logical_args.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_args_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_logical_args_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml b/lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml
new file mode 100644
index 00000000..8fdaac7f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_deduction_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_logical_deduction_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml
new file mode 100644
index 00000000..a74d11ea
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_fallacy_detection_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_logical_fallacy_detection_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml b/lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml
new file mode 100644
index 00000000..b55c057b
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: logical_sequence_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_logical_sequence_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml b/lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml
new file mode 100644
index 00000000..59e4fc3f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: mathematical_induction_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_mathematical_induction_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml b/lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml
new file mode 100644
index 00000000..1a162eae
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: matrixshapes_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_matrixshapes_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml
new file mode 100644
index 00000000..28922b3f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: metaphor_boolean_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_metaphor_boolean_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml
new file mode 100644
index 00000000..029a4c0a
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: metaphor_understanding_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_metaphor_understanding_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml
new file mode 100644
index 00000000..d453fd94
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: minute_mysteries_qa_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_minute_mysteries_qa_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/misconceptions.yaml b/lm_eval/tasks/bigbench/generate_until/misconceptions.yaml
new file mode 100644
index 00000000..f3375eb6
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/misconceptions.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: misconceptions_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_misconceptions_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml
new file mode 100644
index 00000000..a5e5e102
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: misconceptions_russian_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_misconceptions_russian_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml b/lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml
new file mode 100644
index 00000000..db7ce738
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: mnist_ascii_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_mnist_ascii_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml
new file mode 100644
index 00000000..edbb2b34
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: modified_arithmetic_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_modified_arithmetic_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml b/lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml
new file mode 100644
index 00000000..277bf69f
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: moral_permissibility_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_moral_permissibility_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml
new file mode 100644
index 00000000..27cc6228
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: movie_dialog_same_or_different_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_movie_dialog_same_or_different_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml b/lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml
new file mode 100644
index 00000000..97c370ce
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: movie_recommendation_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_movie_recommendation_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml
new file mode 100644
index 00000000..622c7ab1
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: mult_data_wrangling_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_mult_data_wrangling_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/multiemo.yaml b/lm_eval/tasks/bigbench/generate_until/multiemo.yaml
new file mode 100644
index 00000000..465ccd0c
--- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/multiemo.yaml
@@ -0,0 +1,4 @@
+# Generated by utils.py
+dataset_name: multiemo_zero_shot
+include: ../generate_until_template_yaml
+task: bigbench_multiemo_generate_until
diff --git a/lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml b/lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml
new file mode 100644 index 00000000..9b77c895 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: natural_instructions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_natural_instructions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/navigate.yaml b/lm_eval/tasks/bigbench/generate_until/navigate.yaml new file mode 100644 index 00000000..549ed370 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/navigate.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: navigate_zero_shot +include: ../generate_until_template_yaml +task: bigbench_navigate_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml new file mode 100644 index 00000000..0ed30902 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: nonsense_words_grammar_zero_shot +include: ../generate_until_template_yaml +task: bigbench_nonsense_words_grammar_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml b/lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml new file mode 100644 index 00000000..12f388f8 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: novel_concepts_zero_shot +include: ../generate_until_template_yaml +task: bigbench_novel_concepts_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/object_counting.yaml b/lm_eval/tasks/bigbench/generate_until/object_counting.yaml new file mode 100644 index 00000000..a9fc9569 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/object_counting.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: object_counting_zero_shot +include: ../generate_until_template_yaml +task: bigbench_object_counting_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml b/lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml new file mode 100644 index 00000000..a58d7b5f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: odd_one_out_zero_shot +include: ../generate_until_template_yaml +task: bigbench_odd_one_out_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/operators.yaml b/lm_eval/tasks/bigbench/generate_until/operators.yaml new file mode 100644 index 00000000..d6aaa8b6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/operators.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: operators_zero_shot +include: ../generate_until_template_yaml +task: bigbench_operators_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml new file mode 100644 index 00000000..5f982c5d --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: paragraph_segmentation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_paragraph_segmentation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml new file mode 100644 index 00000000..552f8c60 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml @@ -0,0 +1,4 
@@ +# Generated by utils.py +dataset_name: parsinlu_qa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_parsinlu_qa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml new file mode 100644 index 00000000..358184e1 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: parsinlu_reading_comprehension_zero_shot +include: ../generate_until_template_yaml +task: bigbench_parsinlu_reading_comprehension_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml new file mode 100644 index 00000000..6dc70030 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: penguins_in_a_table_zero_shot +include: ../generate_until_template_yaml +task: bigbench_penguins_in_a_table_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml b/lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml new file mode 100644 index 00000000..c5c96cec --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: periodic_elements_zero_shot +include: ../generate_until_template_yaml +task: bigbench_periodic_elements_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml b/lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml new file mode 100644 index 00000000..7e3aa0f4 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: persian_idioms_zero_shot +include: ../generate_until_template_yaml +task: bigbench_persian_idioms_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml new file mode 100644 index 00000000..037da053 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: phrase_relatedness_zero_shot +include: ../generate_until_template_yaml +task: bigbench_phrase_relatedness_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml b/lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml new file mode 100644 index 00000000..ecef1581 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: physical_intuition_zero_shot +include: ../generate_until_template_yaml +task: bigbench_physical_intuition_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/physics.yaml b/lm_eval/tasks/bigbench/generate_until/physics.yaml new file mode 100644 index 00000000..39bc786b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/physics.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: physics_zero_shot +include: ../generate_until_template_yaml +task: bigbench_physics_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/physics_questions.yaml b/lm_eval/tasks/bigbench/generate_until/physics_questions.yaml new file mode 100644 index 00000000..3fcfd477 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/physics_questions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py 
+dataset_name: physics_questions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_physics_questions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml new file mode 100644 index 00000000..57b65cfd --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: play_dialog_same_or_different_zero_shot +include: ../generate_until_template_yaml +task: bigbench_play_dialog_same_or_different_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml new file mode 100644 index 00000000..23775493 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: polish_sequence_labeling_zero_shot +include: ../generate_until_template_yaml +task: bigbench_polish_sequence_labeling_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml new file mode 100644 index 00000000..70da2d74 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: presuppositions_as_nli_zero_shot +include: ../generate_until_template_yaml +task: bigbench_presuppositions_as_nli_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml b/lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml new file mode 100644 index 00000000..9fb5b230 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: qa_wikidata_zero_shot +include: ../generate_until_template_yaml +task: bigbench_qa_wikidata_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/question_selection.yaml b/lm_eval/tasks/bigbench/generate_until/question_selection.yaml new file mode 100644 index 00000000..8e2321a8 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/question_selection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: question_selection_zero_shot +include: ../generate_until_template_yaml +task: bigbench_question_selection_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml new file mode 100644 index 00000000..948bfb0c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: real_or_fake_text_zero_shot +include: ../generate_until_template_yaml +task: bigbench_real_or_fake_text_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..0b371d6e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: reasoning_about_colored_objects_zero_shot +include: ../generate_until_template_yaml +task: bigbench_reasoning_about_colored_objects_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml new file mode 100644 index 00000000..bd8cd4d8 
--- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: repeat_copy_logic_zero_shot +include: ../generate_until_template_yaml +task: bigbench_repeat_copy_logic_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/rephrase.yaml b/lm_eval/tasks/bigbench/generate_until/rephrase.yaml new file mode 100644 index 00000000..16a337db --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/rephrase.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: rephrase_zero_shot +include: ../generate_until_template_yaml +task: bigbench_rephrase_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml b/lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml new file mode 100644 index 00000000..745cdb32 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: riddle_sense_zero_shot +include: ../generate_until_template_yaml +task: bigbench_riddle_sense_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/ruin_names.yaml b/lm_eval/tasks/bigbench/generate_until/ruin_names.yaml new file mode 100644 index 00000000..e9ceddad --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/ruin_names.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ruin_names_zero_shot +include: ../generate_until_template_yaml +task: bigbench_ruin_names_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml new file mode 100644 index 00000000..4968e441 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: salient_translation_error_detection_zero_shot +include: ../generate_until_template_yaml +task: bigbench_salient_translation_error_detection_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml b/lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml new file mode 100644 index 00000000..122f66e7 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: scientific_press_release_zero_shot +include: ../generate_until_template_yaml +task: bigbench_scientific_press_release_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml new file mode 100644 index 00000000..276c997a --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: semantic_parsing_in_context_sparc_zero_shot +include: ../generate_until_template_yaml +task: bigbench_semantic_parsing_in_context_sparc_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml new file mode 100644 index 00000000..39307d92 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: semantic_parsing_spider_zero_shot +include: ../generate_until_template_yaml +task: bigbench_semantic_parsing_spider_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml 
b/lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml new file mode 100644 index 00000000..263b453f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sentence_ambiguity_zero_shot +include: ../generate_until_template_yaml +task: bigbench_sentence_ambiguity_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml b/lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml new file mode 100644 index 00000000..c33b1c8b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: similarities_abstraction_zero_shot +include: ../generate_until_template_yaml +task: bigbench_similarities_abstraction_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml new file mode 100644 index 00000000..6eb9cd87 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simp_turing_concept_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simp_turing_concept_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml new file mode 100644 index 00000000..3ff5a1b1 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_arithmetic_json_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml new file mode 100644 index 00000000..8d130973 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_multiple_choice_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_arithmetic_json_multiple_choice_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml new file mode 100644 index 00000000..57052288 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_subtasks_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_arithmetic_json_subtasks_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml new file mode 100644 index 00000000..393ec884 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_multiple_targets_json_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_arithmetic_multiple_targets_json_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml new file mode 100644 index 00000000..44960774 
--- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_ethical_questions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_ethical_questions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml b/lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml new file mode 100644 index 00000000..d3310fa2 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_text_editing_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_text_editing_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/snarks.yaml b/lm_eval/tasks/bigbench/generate_until/snarks.yaml new file mode 100644 index 00000000..d362537a --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/snarks.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: snarks_zero_shot +include: ../generate_until_template_yaml +task: bigbench_snarks_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/social_iqa.yaml b/lm_eval/tasks/bigbench/generate_until/social_iqa.yaml new file mode 100644 index 00000000..4ba7721d --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/social_iqa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: social_iqa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_social_iqa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/social_support.yaml b/lm_eval/tasks/bigbench/generate_until/social_support.yaml new file mode 100644 index 00000000..dc00bb83 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/social_support.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: social_support_zero_shot +include: ../generate_until_template_yaml +task: bigbench_social_support_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml new file mode 100644 index 00000000..474c08ae --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sports_understanding_zero_shot +include: ../generate_until_template_yaml +task: bigbench_sports_understanding_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/strange_stories.yaml b/lm_eval/tasks/bigbench/generate_until/strange_stories.yaml new file mode 100644 index 00000000..f5405d92 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/strange_stories.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: strange_stories_zero_shot +include: ../generate_until_template_yaml +task: bigbench_strange_stories_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/strategyqa.yaml b/lm_eval/tasks/bigbench/generate_until/strategyqa.yaml new file mode 100644 index 00000000..47c4b25c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/strategyqa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: strategyqa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_strategyqa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml b/lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml new file mode 100644 index 00000000..0705a250 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py 
+dataset_name: sufficient_information_zero_shot +include: ../generate_until_template_yaml +task: bigbench_sufficient_information_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml b/lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml new file mode 100644 index 00000000..e276c4a0 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: suicide_risk_zero_shot +include: ../generate_until_template_yaml +task: bigbench_suicide_risk_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml new file mode 100644 index 00000000..c218adb3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swahili_english_proverbs_zero_shot +include: ../generate_until_template_yaml +task: bigbench_swahili_english_proverbs_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml new file mode 100644 index 00000000..5a13d6f7 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swedish_to_german_proverbs_zero_shot +include: ../generate_until_template_yaml +task: bigbench_swedish_to_german_proverbs_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml new file mode 100644 index 00000000..cca33bf6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: symbol_interpretation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_symbol_interpretation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml b/lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml new file mode 100644 index 00000000..414dc51b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: temporal_sequences_zero_shot +include: ../generate_until_template_yaml +task: bigbench_temporal_sequences_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/tense.yaml b/lm_eval/tasks/bigbench/generate_until/tense.yaml new file mode 100644 index 00000000..480b95ec --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/tense.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tense_zero_shot +include: ../generate_until_template_yaml +task: bigbench_tense_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/timedial.yaml b/lm_eval/tasks/bigbench/generate_until/timedial.yaml new file mode 100644 index 00000000..854d8642 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/timedial.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: timedial_zero_shot +include: ../generate_until_template_yaml +task: bigbench_timedial_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/topical_chat.yaml b/lm_eval/tasks/bigbench/generate_until/topical_chat.yaml new file mode 100644 index 00000000..47a301cf --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/topical_chat.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: topical_chat_zero_shot +include: 
../generate_until_template_yaml +task: bigbench_topical_chat_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml new file mode 100644 index 00000000..9c02866c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tracking_shuffled_objects_zero_shot +include: ../generate_until_template_yaml +task: bigbench_tracking_shuffled_objects_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml b/lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml new file mode 100644 index 00000000..9972f403 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: understanding_fables_zero_shot +include: ../generate_until_template_yaml +task: bigbench_understanding_fables_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml b/lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml new file mode 100644 index 00000000..3f0e914c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: undo_permutation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_undo_permutation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml b/lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml new file mode 100644 index 00000000..6f3747c4 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unit_conversion_zero_shot +include: ../generate_until_template_yaml +task: bigbench_unit_conversion_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml b/lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml new file mode 100644 index 00000000..34c882dc --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unit_interpretation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_unit_interpretation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml new file mode 100644 index 00000000..deddb77d --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unnatural_in_context_learning_zero_shot +include: ../generate_until_template_yaml +task: bigbench_unnatural_in_context_learning_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml new file mode 100644 index 00000000..6f2ad8d3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vitaminc_fact_verification_zero_shot +include: ../generate_until_template_yaml +task: bigbench_vitaminc_fact_verification_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml new file mode 100644 index 00000000..3a1487ab --- /dev/null +++ 
b/lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: what_is_the_tao_zero_shot +include: ../generate_until_template_yaml +task: bigbench_what_is_the_tao_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml new file mode 100644 index 00000000..bc05c377 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: which_wiki_edit_zero_shot +include: ../generate_until_template_yaml +task: bigbench_which_wiki_edit_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/winowhy.yaml b/lm_eval/tasks/bigbench/generate_until/winowhy.yaml new file mode 100644 index 00000000..99ff22d9 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/winowhy.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: winowhy_zero_shot +include: ../generate_until_template_yaml +task: bigbench_winowhy_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/word_sorting.yaml b/lm_eval/tasks/bigbench/generate_until/word_sorting.yaml new file mode 100644 index 00000000..16be6060 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/word_sorting.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: word_sorting_zero_shot +include: ../generate_until_template_yaml +task: bigbench_word_sorting_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml b/lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml new file mode 100644 index 00000000..5632a79c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: word_unscrambling_zero_shot +include: ../generate_until_template_yaml +task: bigbench_word_unscrambling_generate_until diff --git a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml deleted file mode 100644 index dd041fdd..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: abstract_narrative_understanding_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_abstract_narrative_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml b/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml deleted file mode 100644 index 9e723927..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: anachronisms_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_anachronisms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml b/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml deleted file mode 100644 index 3d2e82b4..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: analogical_similarity_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_analogical_similarity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml b/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml deleted file mode 100644 index a8425049..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml +++ /dev/null @@ -1,4 +0,0 
@@ -# Generated by utils.py -dataset_name: analytic_entailment_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_analytic_entailment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml deleted file mode 100644 index be296b1b..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: arithmetic_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_arithmetic_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml deleted file mode 100644 index d199e8a5..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: ascii_word_recognition_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_ascii_word_recognition_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml deleted file mode 100644 index 65d8177c..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: authorship_verification_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_authorship_verification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml deleted file mode 100644 index 3ce36427..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: auto_categorization_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_auto_categorization_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml deleted file mode 100644 index e25bee24..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: auto_debugging_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_auto_debugging_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml deleted file mode 100644 index d1d45477..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: bbq_lite_json_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_bbq_lite_json_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml deleted file mode 100644 index a20da27f..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: bridging_anaphora_resolution_barqa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_bridging_anaphora_resolution_barqa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml b/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml deleted file mode 100644 index 2b9c89af..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# 
Generated by utils.py -dataset_name: causal_judgment_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_causal_judgment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml b/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml deleted file mode 100644 index 5dd23108..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cause_and_effect_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cause_and_effect_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml deleted file mode 100644 index 06681769..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: checkmate_in_one_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_checkmate_in_one_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml deleted file mode 100644 index 6a9a088e..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: chess_state_tracking_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_chess_state_tracking_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml deleted file mode 100644 index f3937088..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: chinese_remainder_theorem_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_chinese_remainder_theorem_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml b/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml deleted file mode 100644 index 6bad6797..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cifar10_classification_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cifar10_classification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml b/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml deleted file mode 100644 index de1f7829..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: code_line_description_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_code_line_description_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/codenames.yaml b/lm_eval/tasks/bigbench/greedy_until/codenames.yaml deleted file mode 100644 index 83feca88..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/codenames.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: codenames_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_codenames_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/color.yaml b/lm_eval/tasks/bigbench/greedy_until/color.yaml deleted file mode 100644 index 5aa9c1a9..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/color.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: color_zero_shot -include: 
../greedy_until_template_yaml -task: bigbench_color_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml b/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml deleted file mode 100644 index ec0fdc44..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: common_morpheme_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_common_morpheme_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml deleted file mode 100644 index 5eaba446..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: conceptual_combinations_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_conceptual_combinations_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml b/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml deleted file mode 100644 index afae8184..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: conlang_translation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_conlang_translation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml deleted file mode 100644 index bb7eba64..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: contextual_parametric_knowledge_conflicts_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_contextual_parametric_knowledge_conflicts_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml b/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml deleted file mode 100644 index ae7f6b9f..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: crash_blossom_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_crash_blossom_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml b/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml deleted file mode 100644 index 7d56bbc2..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: crass_ai_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_crass_ai_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml deleted file mode 100644 index 37fd99ad..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cryobiology_spanish_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cryobiology_spanish_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml b/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml deleted file mode 100644 index 64577738..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cryptonite_zero_shot -include: ../greedy_until_template_yaml 
-task: bigbench_cryptonite_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml b/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml deleted file mode 100644 index 9279c295..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cs_algorithms_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cs_algorithms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml deleted file mode 100644 index 014d57e6..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: dark_humor_detection_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_dark_humor_detection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml deleted file mode 100644 index 999a7e71..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: date_understanding_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_date_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml deleted file mode 100644 index db25589d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: disambiguation_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_disambiguation_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml deleted file mode 100644 index ae8941e8..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: discourse_marker_prediction_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_discourse_marker_prediction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml deleted file mode 100644 index 0086850a..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: disfl_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_disfl_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml b/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml deleted file mode 100644 index e8de0093..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: dyck_languages_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_dyck_languages_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml deleted file mode 100644 index 55369151..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: elementary_math_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_elementary_math_qa_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml b/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml deleted file mode 100644 index 4553ede7..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: emoji_movie_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_emoji_movie_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml b/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml deleted file mode 100644 index e570e24a..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: emojis_emotion_prediction_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_emojis_emotion_prediction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml b/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml deleted file mode 100644 index d4f2f3cf..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: empirical_judgments_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_empirical_judgments_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml deleted file mode 100644 index b7628796..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: english_proverbs_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_english_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml deleted file mode 100644 index ea719e1d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: english_russian_proverbs_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_english_russian_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml deleted file mode 100644 index e3d89fc2..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: entailed_polarity_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_entailed_polarity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml deleted file mode 100644 index e416a059..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: entailed_polarity_hindi_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_entailed_polarity_hindi_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml deleted file mode 100644 index 8f8efc4e..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: epistemic_reasoning_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_epistemic_reasoning_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml deleted file mode 100644 index b35240c4..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: evaluating_information_essentiality_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_evaluating_information_essentiality_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml b/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml deleted file mode 100644 index f83e4081..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: fact_checker_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_fact_checker_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml b/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml deleted file mode 100644 index ab38359d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: fantasy_reasoning_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_fantasy_reasoning_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml deleted file mode 100644 index bf1e33e0..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: few_shot_nlg_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_few_shot_nlg_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml deleted file mode 100644 index 184cd4e6..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: figure_of_speech_detection_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_figure_of_speech_detection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml deleted file mode 100644 index cb1915b8..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: formal_fallacies_syllogisms_negation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_formal_fallacies_syllogisms_negation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/gem.yaml b/lm_eval/tasks/bigbench/greedy_until/gem.yaml deleted file mode 100644 index aa43ca45..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/gem.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: gem_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_gem_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml deleted file mode 100644 index 6471e577..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: gender_inclusive_sentences_german_zero_shot -include: 
../greedy_until_template_yaml -task: bigbench_gender_inclusive_sentences_german_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml deleted file mode 100644 index 93a3f875..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: general_knowledge_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_general_knowledge_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml b/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml deleted file mode 100644 index c3a5d9a7..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: geometric_shapes_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_geometric_shapes_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml deleted file mode 100644 index 6fd557d3..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: goal_step_wikihow_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_goal_step_wikihow_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml deleted file mode 100644 index c4416b10..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: gre_reading_comprehension_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_gre_reading_comprehension_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml b/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml deleted file mode 100644 index 4060824c..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: hhh_alignment_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_hhh_alignment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml deleted file mode 100644 index 5c4791b4..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: hindi_question_answering_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_hindi_question_answering_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml deleted file mode 100644 index 040441f7..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: hindu_knowledge_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_hindu_knowledge_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml deleted file mode 100644 index 0eb98e51..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: hinglish_toxicity_zero_shot -include: ../greedy_until_template_yaml 
-task: bigbench_hinglish_toxicity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml b/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml deleted file mode 100644 index c5541571..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: human_organs_senses_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_human_organs_senses_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml b/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml deleted file mode 100644 index 4368f4c9..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: hyperbaton_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_hyperbaton_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml deleted file mode 100644 index 2c08703e..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: identify_math_theorems_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_identify_math_theorems_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml deleted file mode 100644 index 9cb39d0d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: identify_odd_metaphor_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_identify_odd_metaphor_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml b/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml deleted file mode 100644 index e216762c..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: implicatures_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_implicatures_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml b/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml deleted file mode 100644 index c7a82a10..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: implicit_relations_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_implicit_relations_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml deleted file mode 100644 index 4839afa2..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: intent_recognition_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_intent_recognition_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml deleted file mode 100644 index 62643a46..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: international_phonetic_alphabet_nli_zero_shot -include: ../greedy_until_template_yaml -task: 
bigbench_international_phonetic_alphabet_nli_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml deleted file mode 100644 index 05feb4f5..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: international_phonetic_alphabet_transliterate_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_international_phonetic_alphabet_transliterate_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml b/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml deleted file mode 100644 index 57745d23..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: intersect_geometry_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_intersect_geometry_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml deleted file mode 100644 index b49dfb44..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: irony_identification_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_irony_identification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml deleted file mode 100644 index 293ff6c2..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: kanji_ascii_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_kanji_ascii_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/kannada.yaml b/lm_eval/tasks/bigbench/greedy_until/kannada.yaml deleted file mode 100644 index 00eeb32a..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/kannada.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: kannada_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_kannada_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml b/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml deleted file mode 100644 index d313e1ce..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: key_value_maps_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_key_value_maps_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml b/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml deleted file mode 100644 index d72e1d37..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: known_unknowns_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_known_unknowns_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/language_games.yaml b/lm_eval/tasks/bigbench/greedy_until/language_games.yaml deleted file mode 100644 index 61e85b53..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/language_games.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: language_games_zero_shot -include: ../greedy_until_template_yaml -task: 
bigbench_language_games_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml deleted file mode 100644 index 8db65637..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: language_identification_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_language_identification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml deleted file mode 100644 index db6e9832..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: linguistic_mappings_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_linguistic_mappings_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml deleted file mode 100644 index 4e3981f4..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: linguistics_puzzles_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_linguistics_puzzles_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml b/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml deleted file mode 100644 index 32afff69..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: list_functions_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_list_functions_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml deleted file mode 100644 index a1d1b5b1..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: logic_grid_puzzle_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_logic_grid_puzzle_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml deleted file mode 100644 index 201c04ae..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: logical_args_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_logical_args_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml deleted file mode 100644 index 1b77561d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: logical_deduction_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_logical_deduction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml deleted file mode 100644 index af3e9ea4..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: logical_fallacy_detection_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_logical_fallacy_detection_greedy_until 
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml deleted file mode 100644 index 4d4ffe1d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: logical_sequence_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_logical_sequence_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml b/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml deleted file mode 100644 index 84d0f419..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: mathematical_induction_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_mathematical_induction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml b/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml deleted file mode 100644 index 956aa5f0..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: matrixshapes_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_matrixshapes_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml deleted file mode 100644 index 7fd4e53c..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: metaphor_boolean_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_metaphor_boolean_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml deleted file mode 100644 index 12b79d44..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: metaphor_understanding_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_metaphor_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml deleted file mode 100644 index 459aec57..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: minute_mysteries_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_minute_mysteries_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml deleted file mode 100644 index 25038ae3..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: misconceptions_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_misconceptions_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml deleted file mode 100644 index 676d94ea..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: misconceptions_russian_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_misconceptions_russian_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml deleted file mode 100644 index 19c9a82b..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: mnist_ascii_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_mnist_ascii_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml deleted file mode 100644 index 313b5b9d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: modified_arithmetic_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_modified_arithmetic_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml b/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml deleted file mode 100644 index f478ed24..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: moral_permissibility_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_moral_permissibility_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml deleted file mode 100644 index 98e06e5d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: movie_dialog_same_or_different_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_movie_dialog_same_or_different_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml deleted file mode 100644 index 7cd021a4..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: movie_recommendation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_movie_recommendation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml deleted file mode 100644 index 92b84838..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: mult_data_wrangling_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_mult_data_wrangling_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml b/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml deleted file mode 100644 index ac4f9432..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: multiemo_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_multiemo_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml b/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml deleted file mode 100644 index 0b87004d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: natural_instructions_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_natural_instructions_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/navigate.yaml b/lm_eval/tasks/bigbench/greedy_until/navigate.yaml deleted file mode 100644 index 85fd618b..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/navigate.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: navigate_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_navigate_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml deleted file mode 100644 index 863b0a85..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: nonsense_words_grammar_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_nonsense_words_grammar_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml b/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml deleted file mode 100644 index b3b08806..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: novel_concepts_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_novel_concepts_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml b/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml deleted file mode 100644 index fc0d6119..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: object_counting_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_object_counting_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml b/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml deleted file mode 100644 index 90d0fd93..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: odd_one_out_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_odd_one_out_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/operators.yaml b/lm_eval/tasks/bigbench/greedy_until/operators.yaml deleted file mode 100644 index d4ad9b91..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/operators.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: operators_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_operators_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml deleted file mode 100644 index c661e1a7..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: paragraph_segmentation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_paragraph_segmentation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml deleted file mode 100644 index 4ea51e21..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: parsinlu_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_parsinlu_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml deleted file mode 100644 index 967741fd..00000000 --- 
a/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: parsinlu_reading_comprehension_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_parsinlu_reading_comprehension_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml deleted file mode 100644 index 5e59b741..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: penguins_in_a_table_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_penguins_in_a_table_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml b/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml deleted file mode 100644 index a7ed5a82..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: periodic_elements_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_periodic_elements_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml b/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml deleted file mode 100644 index 087d4688..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: persian_idioms_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_persian_idioms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml deleted file mode 100644 index c2da5cce..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: phrase_relatedness_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_phrase_relatedness_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml b/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml deleted file mode 100644 index 1482fe65..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: physical_intuition_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_physical_intuition_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/physics.yaml b/lm_eval/tasks/bigbench/greedy_until/physics.yaml deleted file mode 100644 index 7fade7b3..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/physics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: physics_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_physics_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml deleted file mode 100644 index bf332361..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: physics_questions_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_physics_questions_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml deleted file mode 100644 index 1ddf7ca7..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml 
+++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: play_dialog_same_or_different_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_play_dialog_same_or_different_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml deleted file mode 100644 index 10c8bd98..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: polish_sequence_labeling_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_polish_sequence_labeling_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml deleted file mode 100644 index 66d0e5ea..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: presuppositions_as_nli_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_presuppositions_as_nli_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml b/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml deleted file mode 100644 index 67240110..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: qa_wikidata_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_qa_wikidata_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml b/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml deleted file mode 100644 index 5652cb3f..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: question_selection_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_question_selection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml deleted file mode 100644 index c206597b..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: real_or_fake_text_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_real_or_fake_text_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml deleted file mode 100644 index 8b1051e5..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: reasoning_about_colored_objects_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_reasoning_about_colored_objects_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml deleted file mode 100644 index 279ecd01..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: repeat_copy_logic_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_repeat_copy_logic_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml b/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml deleted file mode 100644 index 90135638..00000000 --- 
a/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: rephrase_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_rephrase_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml b/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml deleted file mode 100644 index a11c167d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: riddle_sense_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_riddle_sense_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml b/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml deleted file mode 100644 index 5074e010..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: ruin_names_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_ruin_names_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml deleted file mode 100644 index 7f2ce433..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: salient_translation_error_detection_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_salient_translation_error_detection_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml b/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml deleted file mode 100644 index 90071882..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: scientific_press_release_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_scientific_press_release_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml deleted file mode 100644 index 93ddccc2..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: semantic_parsing_in_context_sparc_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_semantic_parsing_in_context_sparc_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml deleted file mode 100644 index cc590faf..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: semantic_parsing_spider_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_semantic_parsing_spider_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml b/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml deleted file mode 100644 index 6cbacb79..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: sentence_ambiguity_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_sentence_ambiguity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml 
b/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml deleted file mode 100644 index 10e9a439..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: similarities_abstraction_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_similarities_abstraction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml deleted file mode 100644 index a82b8226..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: simp_turing_concept_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_simp_turing_concept_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml deleted file mode 100644 index 8e0a207e..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: simple_arithmetic_json_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_simple_arithmetic_json_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml deleted file mode 100644 index df235325..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: simple_arithmetic_json_multiple_choice_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_simple_arithmetic_json_multiple_choice_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml deleted file mode 100644 index 2f981fb0..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: simple_arithmetic_json_subtasks_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_simple_arithmetic_json_subtasks_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml deleted file mode 100644 index 2bc6cf16..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: simple_arithmetic_multiple_targets_json_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_simple_arithmetic_multiple_targets_json_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml deleted file mode 100644 index 77e45a58..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: simple_ethical_questions_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_simple_ethical_questions_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml deleted file mode 100644 index 1b485d5c..00000000 --- 
a/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: simple_text_editing_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_simple_text_editing_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/snarks.yaml b/lm_eval/tasks/bigbench/greedy_until/snarks.yaml deleted file mode 100644 index 9ccbda74..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/snarks.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: snarks_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_snarks_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml b/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml deleted file mode 100644 index 9cbc5ec5..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: social_iqa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_social_iqa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/social_support.yaml b/lm_eval/tasks/bigbench/greedy_until/social_support.yaml deleted file mode 100644 index bcc3a9d1..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/social_support.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: social_support_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_social_support_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml deleted file mode 100644 index 01082a10..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: sports_understanding_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_sports_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml b/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml deleted file mode 100644 index a0bf1c46..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: strange_stories_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_strange_stories_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml b/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml deleted file mode 100644 index 495d873f..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: strategyqa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_strategyqa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml b/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml deleted file mode 100644 index 3484952c..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: sufficient_information_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_sufficient_information_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml b/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml deleted file mode 100644 index a8e980d5..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: suicide_risk_zero_shot -include: ../greedy_until_template_yaml -task: 
bigbench_suicide_risk_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml deleted file mode 100644 index ff045534..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: swahili_english_proverbs_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_swahili_english_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml deleted file mode 100644 index 8cbd401b..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: swedish_to_german_proverbs_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_swedish_to_german_proverbs_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml deleted file mode 100644 index 3fa4cdba..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: symbol_interpretation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_symbol_interpretation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml b/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml deleted file mode 100644 index c20300f8..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: temporal_sequences_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_temporal_sequences_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/tense.yaml b/lm_eval/tasks/bigbench/greedy_until/tense.yaml deleted file mode 100644 index b1b5698d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/tense.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: tense_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_tense_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/timedial.yaml b/lm_eval/tasks/bigbench/greedy_until/timedial.yaml deleted file mode 100644 index d5f1950e..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/timedial.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: timedial_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_timedial_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml b/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml deleted file mode 100644 index 4ec83039..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: topical_chat_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_topical_chat_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml deleted file mode 100644 index 27024bee..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: tracking_shuffled_objects_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_tracking_shuffled_objects_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml b/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml deleted file mode 100644 index f467652d..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: understanding_fables_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_understanding_fables_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml b/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml deleted file mode 100644 index d91ff331..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: undo_permutation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_undo_permutation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml deleted file mode 100644 index a31929fb..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: unit_conversion_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_unit_conversion_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml deleted file mode 100644 index ca4c38be..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: unit_interpretation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_unit_interpretation_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml deleted file mode 100644 index 1cc271d2..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: unnatural_in_context_learning_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_unnatural_in_context_learning_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml deleted file mode 100644 index 770e8500..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: vitaminc_fact_verification_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_vitaminc_fact_verification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml deleted file mode 100644 index 8c60da65..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: what_is_the_tao_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_what_is_the_tao_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml deleted file mode 100644 index 4eda6d08..00000000 --- a/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: which_wiki_edit_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_which_wiki_edit_greedy_until diff 
--git a/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml b/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml
deleted file mode 100644
index e065c80c..00000000
--- a/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: winowhy_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_winowhy_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml b/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
deleted file mode 100644
index caa6f02d..00000000
--- a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: word_sorting_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_word_sorting_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml b/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
deleted file mode 100644
index 774aef15..00000000
--- a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: word_unscrambling_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_word_unscrambling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/greedy_until_template_yaml
index 130500cc..ebce0377 100644
--- a/lm_eval/tasks/bigbench/greedy_until_template_yaml
+++ b/lm_eval/tasks/bigbench/greedy_until_template_yaml
@@ -1,6 +1,6 @@
 group: bigbench
 dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed
-output_type: greedy_until
+output_type: generate_until
 dataset_kwargs:
   # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
   # subtask_name: null
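This template rename is the functional change behind the mass deletion above: the per-task stubs were four-line files generated by `utils.py`, and they would presumably be regenerated against the renamed output type. A sketch of one regenerated stub, assuming the directory, include target, and task suffix follow the new `generate_until` name (none of which appears in this patch):

```yaml
# Generated by utils.py
# Sketch only: the generate_until directory, include target, and task suffix
# are assumed from the rename; they are not shown in this diff.
dataset_name: winowhy_zero_shot
include: ../generate_until_template_yaml
task: bigbench_winowhy_generate_until
```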
diff --git a/lm_eval/tasks/code_x_glue/code-text/go.yaml b/lm_eval/tasks/code_x_glue/code-text/go.yaml
index 3a4033c6..f8670652 100644
--- a/lm_eval/tasks/code_x_glue/code-text/go.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_go
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/java.yaml b/lm_eval/tasks/code_x_glue/code-text/java.yaml
index 141673c9..aaad034c 100644
--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_java
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
index c537e50d..615fffe3 100644
--- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_javascript
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/php.yaml b/lm_eval/tasks/code_x_glue/code-text/php.yaml
index 9137bdaf..b71a7525 100644
--- a/lm_eval/tasks/code_x_glue/code-text/php.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_php
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/python.yaml b/lm_eval/tasks/code_x_glue/code-text/python.yaml
index a98bfdba..301251b0 100644
--- a/lm_eval/tasks/code_x_glue/code-text/python.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_python
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
index d6562d4c..6a3b4a5a 100644
--- a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_ruby
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
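All six code_x_glue configs keep their beam-search settings and only swap the output type. For a `generate_until` task, everything under `generation_kwargs` drives decoding; combining, purely for illustration, the options that appear in this patch (beam search from the hunks above, the `until` stop sequence from the squadv2 hunk further down) gives a fragment like:

```yaml
# Illustrative fragment only: no single task in this patch sets all of these
# options together; the values are copied from the code_x_glue and squadv2 hunks.
output_type: generate_until
generation_kwargs:
  until:
    - "\n"
  num_beams: 10
  max_length: 128
```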
diff --git a/lm_eval/tasks/coqa/default.yaml b/lm_eval/tasks/coqa/default.yaml
index 5b891192..f928e1f7 100644
--- a/lm_eval/tasks/coqa/default.yaml
+++ b/lm_eval/tasks/coqa/default.yaml
@@ -1,6 +1,6 @@
 task: coqa
 dataset_path: EleutherAI/coqa
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text
diff --git a/lm_eval/tasks/drop/default.yaml b/lm_eval/tasks/drop/default.yaml
index 973fff7b..28560312 100644
--- a/lm_eval/tasks/drop/default.yaml
+++ b/lm_eval/tasks/drop/default.yaml
@@ -1,6 +1,6 @@
 task: drop
 dataset_path: EleutherAI/drop
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 process_docs: !function utils.process_docs
diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
index 6236e519..92555a57 100644
--- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
@@ -3,7 +3,7 @@ group:
 task: gsm8k_cot
 dataset_path: gsm8k
 dataset_name: main
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
 Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\
diff --git a/lm_eval/tasks/gsm8k/gsm8k.yaml b/lm_eval/tasks/gsm8k/gsm8k.yaml
index ebd1b4e1..124f708d 100644
--- a/lm_eval/tasks/gsm8k/gsm8k.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k.yaml
@@ -3,7 +3,7 @@ group:
 task: gsm8k_yaml
 dataset_path: gsm8k
 dataset_name: main
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 fewshot_split: train
 test_split: test
diff --git a/lm_eval/tasks/logiqa2/logieval.yaml b/lm_eval/tasks/logiqa2/logieval.yaml
index 7701426e..9f945be4 100644
--- a/lm_eval/tasks/logiqa2/logieval.yaml
+++ b/lm_eval/tasks/logiqa2/logieval.yaml
@@ -1,7 +1,7 @@
 task: logieval
 dataset_path: baber/logiqa2
 dataset_name: logieval
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 # Instructions + {content}
diff --git a/lm_eval/tasks/mgsm/direct/direct_yaml b/lm_eval/tasks/mgsm/direct/direct_yaml
index 6eae3257..0833ff8a 100644
--- a/lm_eval/tasks/mgsm/direct/direct_yaml
+++ b/lm_eval/tasks/mgsm/direct/direct_yaml
@@ -4,7 +4,7 @@
 group: mgsm_direct
 dataset_path: juletxara/mgsm
 dataset_name: null # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""
diff --git a/lm_eval/tasks/mgsm/en_cot/cot_yaml b/lm_eval/tasks/mgsm/en_cot/cot_yaml
index f5cf60d9..06308fd7 100644
--- a/lm_eval/tasks/mgsm/en_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""
diff --git a/lm_eval/tasks/mgsm/native_cot/cot_yaml b/lm_eval/tasks/mgsm/native_cot/cot_yaml
index f5cf60d9..06308fd7 100644
--- a/lm_eval/tasks/mgsm/native_cot/cot_yaml
+++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml
@@ -4,7 +4,7 @@
 group: mgsm_cot_native
 dataset_path: juletxara/mgsm
 dataset_name: null # Overridden by language-specific config.
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 test_split: test
 target_delimiter: ""
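None of the three mgsm templates above is a runnable task on its own: `dataset_name: null` is overridden by generated language-specific configs that include them. Such a config might look like the sketch below (file name, language code, and task name are illustrative, following the include pattern of the bigbench stubs deleted earlier in this patch):

```yaml
# Hypothetical lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml; the generated
# per-language files are not part of this diff.
include: direct_yaml
dataset_name: es
task: mgsm_direct_es
```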
-output_type: greedy_until +output_type: generate_until training_split: train test_split: test target_delimiter: "" diff --git a/lm_eval/tasks/minerva_math/README.md b/lm_eval/tasks/minerva_math/README.md index 7ca5d652..7bfb7d50 100644 --- a/lm_eval/tasks/minerva_math/README.md +++ b/lm_eval/tasks/minerva_math/README.md @@ -37,7 +37,7 @@ Eprint = {arXiv:2206.14858}, #### Groups - `math_word_problems` -- `greedy_until` +- `generate_until` #### Tasks diff --git a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml index 3ad3802b..8aca7ad5 100644 --- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml +++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml @@ -4,7 +4,7 @@ task: minerva_math_algebra dataset_path: EleutherAI/hendrycks_math process_docs: !function utils.process_docs dataset_name: algebra -output_type: greedy_until +output_type: generate_until training_split: train test_split: test doc_to_text: !function utils.doc_to_text diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index 8461b93a..4b54fb41 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -2,7 +2,7 @@ group: mmlu_flan_cot_fewshot dataset_path: cais/mmlu validation_split: validation fewshot_split: dev -output_type: greedy_until +output_type: generate_until doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" filter_list: diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml index 0666018b..37c95ce7 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml @@ -2,7 +2,7 @@ group: mmlu_flan_cot_zeroshot dataset_path: cais/mmlu validation_split: validation fewshot_split: dev -output_type: greedy_until +output_type: generate_until doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." 
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" filter_list: diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml index b1ff96a8..49046d22 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml @@ -2,7 +2,7 @@ group: mmlu_flan_n_shot_generative dataset_path: cais/mmlu test_split: test fewshot_split: dev -output_type: greedy_until +output_type: generate_until doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: " doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" generation_kwargs: diff --git a/lm_eval/tasks/nq_open/nq_open.yaml b/lm_eval/tasks/nq_open/nq_open.yaml index 1a472151..69ff8dac 100644 --- a/lm_eval/tasks/nq_open/nq_open.yaml +++ b/lm_eval/tasks/nq_open/nq_open.yaml @@ -1,6 +1,6 @@ task: nq_open dataset_path: nq_open -output_type: greedy_until +output_type: generate_until training_split: train validation_split: validation description: "Answer these questions:\n" diff --git a/lm_eval/tasks/polemo2/polemo2_in.yaml b/lm_eval/tasks/polemo2/polemo2_in.yaml index 4c2250f8..b7f7caf8 100644 --- a/lm_eval/tasks/polemo2/polemo2_in.yaml +++ b/lm_eval/tasks/polemo2/polemo2_in.yaml @@ -3,7 +3,7 @@ group: task: polemo2_in dataset_path: allegro/klej-polemo2-in dataset_name: klej-polemo2-in -output_type: greedy_until +output_type: generate_until training_split: train validation_split: validation test_split: test diff --git a/lm_eval/tasks/qasper/freeform.yaml b/lm_eval/tasks/qasper/freeform.yaml index 03324c3b..c65d0f7b 100644 --- a/lm_eval/tasks/qasper/freeform.yaml +++ b/lm_eval/tasks/qasper/freeform.yaml @@ -1,7 +1,7 @@ group: qasper task: qasper_freeform dataset_path: qasper -output_type: greedy_until +output_type: generate_until training_split: train validation_split: validation process_docs: !function utils.process_docs_freeform diff --git a/lm_eval/tasks/squadv2/default.yaml b/lm_eval/tasks/squadv2/default.yaml index 2bb3029f..0f42bf54 100644 --- a/lm_eval/tasks/squadv2/default.yaml +++ b/lm_eval/tasks/squadv2/default.yaml @@ -1,21 +1,9 @@ +include: _template_yaml task: squadv2 -dataset_path: squad_v2 -output_type: greedy_until -training_split: train -validation_split: validation -doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:" -doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}" -target_delimiter: "" -should_decontaminate: true -doc_to_decontamination_query: context +output_type: generate_until generation_kwargs: until: - "\n" -# filter_list: -# - name: remove_whitespace -# filter: -# - function: remove_whitespace -# - function: take_first metric_list: - metric: !function utils.exact aggregation: mean diff --git a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml index 7a6c67db..948ee247 100644 --- a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml +++ b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml @@ -3,7 +3,7 @@ group: task: "boolq-seq2seq" dataset_path: super_glue dataset_name: boolq -output_type: greedy_until +output_type: generate_until training_split: train validation_split: validation doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:" diff --git a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml index 8ebd82fb..86c9a54e 100644 --- 
a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml @@ -5,7 +5,7 @@ dataset_path: super_glue dataset_name: boolq training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: "boolq passage: {{passage}} question: {{question}}" doc_to_target: label doc_to_choice: ['False', 'True'] diff --git a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml index a16505fa..d023f7c8 100644 --- a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml @@ -5,7 +5,7 @@ dataset_path: super_glue dataset_name: cb training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}" doc_to_target: label doc_to_choice: ['entailment', 'contradiction', 'neutral'] diff --git a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml index 47aaf275..227f5d56 100644 --- a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml @@ -5,7 +5,7 @@ dataset_path: super_glue dataset_name: copa training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}" doc_to_target: label doc_to_choice: ['choice1', 'choice2'] diff --git a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml index 008c1443..66eccfef 100644 --- a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml @@ -5,7 +5,7 @@ dataset_path: super_glue dataset_name: multirc training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}" doc_to_target: label doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}" diff --git a/lm_eval/tasks/super_glue/record/t5-prompt.yaml b/lm_eval/tasks/super_glue/record/t5-prompt.yaml index c1db59ad..22440c0a 100644 --- a/lm_eval/tasks/super_glue/record/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/record/t5-prompt.yaml @@ -4,7 +4,7 @@ task: super_glue-record-t5-prompt dataset_path: super_glue dataset_name: record validation_split: validation -output_type: greedy_until +output_type: generate_until process_docs: !function t5_utils.process_docs doc_to_text: !function t5_utils.doc_to_text doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}" diff --git a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml index 870dc363..df0234d7 100644 --- a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml @@ -5,7 +5,7 @@ dataset_path: super_glue dataset_name: rte training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}" doc_to_target: label doc_to_choice: ['entailment', 'not_entailment'] diff --git a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml index da6a9411..3231e41c 100644 --- a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml @@ -5,7 +5,7 @@ dataset_path: super_glue dataset_name: wic 
training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}" doc_to_target: label doc_to_choice: ['False', 'True'] diff --git a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml index e0ef7538..520cee1a 100644 --- a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml @@ -5,7 +5,7 @@ dataset_path: super_glue dataset_name: wsc.fixed training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: !function "t5_utils.doc_to_text" doc_to_target: label generation_kwargs: diff --git a/lm_eval/tasks/translation/iwslt2017_ar-en.yaml b/lm_eval/tasks/translation/iwslt2017_ar-en.yaml index 739fb6c2..ea713393 100644 --- a/lm_eval/tasks/translation/iwslt2017_ar-en.yaml +++ b/lm_eval/tasks/translation/iwslt2017_ar-en.yaml @@ -6,7 +6,7 @@ doc_to_text: 'Arabic phrase: {{translation["ar"]}} English phrase:' group: -- greedy_until +- generate_until - translation - iwslt2017 include: wmt_common_yaml diff --git a/lm_eval/tasks/translation/iwslt2017_en-ar.yaml b/lm_eval/tasks/translation/iwslt2017_en-ar.yaml index d3c0462a..891ad50f 100644 --- a/lm_eval/tasks/translation/iwslt2017_en-ar.yaml +++ b/lm_eval/tasks/translation/iwslt2017_en-ar.yaml @@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} Arabic phrase:' group: -- greedy_until +- generate_until - translation - iwslt2017 include: wmt_common_yaml diff --git a/lm_eval/tasks/translation/utils.py b/lm_eval/tasks/translation/utils.py index aacc1e96..f80ae89a 100644 --- a/lm_eval/tasks/translation/utils.py +++ b/lm_eval/tasks/translation/utils.py @@ -58,7 +58,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: try: source, target = code_to_language(src), code_to_language(tgt) - groups = ["greedy_until", "translation", lang] + groups = ["generate_until", "translation", lang] if lang in gpt3_translation_benchmarks.keys(): groups += ["gpt3_translation_benchmarks"] diff --git a/lm_eval/tasks/translation/wmt14_en-fr.yaml b/lm_eval/tasks/translation/wmt14_en-fr.yaml index 154b0698..b7e42dca 100644 --- a/lm_eval/tasks/translation/wmt14_en-fr.yaml +++ b/lm_eval/tasks/translation/wmt14_en-fr.yaml @@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} French phrase:' group: -- greedy_until +- generate_until - translation - wmt14 - gpt3_translation_benchmarks diff --git a/lm_eval/tasks/translation/wmt14_fr-en.yaml b/lm_eval/tasks/translation/wmt14_fr-en.yaml index 0a414359..09ddd57d 100644 --- a/lm_eval/tasks/translation/wmt14_fr-en.yaml +++ b/lm_eval/tasks/translation/wmt14_fr-en.yaml @@ -6,7 +6,7 @@ doc_to_text: 'French phrase: {{translation["fr"]}} English phrase:' group: -- greedy_until +- generate_until - translation - wmt14 - gpt3_translation_benchmarks diff --git a/lm_eval/tasks/translation/wmt16_de-en.yaml b/lm_eval/tasks/translation/wmt16_de-en.yaml index b38d21b8..23d50e4a 100644 --- a/lm_eval/tasks/translation/wmt16_de-en.yaml +++ b/lm_eval/tasks/translation/wmt16_de-en.yaml @@ -6,7 +6,7 @@ doc_to_text: 'German phrase: {{translation["de"]}} English phrase:' group: -- greedy_until +- generate_until - translation - wmt16 - gpt3_translation_benchmarks diff --git a/lm_eval/tasks/translation/wmt16_en-de.yaml b/lm_eval/tasks/translation/wmt16_en-de.yaml index e7ac0d77..8d391b6c 100644 --- a/lm_eval/tasks/translation/wmt16_en-de.yaml +++ 
b/lm_eval/tasks/translation/wmt16_en-de.yaml @@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} German phrase:' group: -- greedy_until +- generate_until - translation - wmt16 - gpt3_translation_benchmarks diff --git a/lm_eval/tasks/translation/wmt16_en-ro.yaml b/lm_eval/tasks/translation/wmt16_en-ro.yaml index c214b56c..45a8cae1 100644 --- a/lm_eval/tasks/translation/wmt16_en-ro.yaml +++ b/lm_eval/tasks/translation/wmt16_en-ro.yaml @@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} Romanian phrase:' group: -- greedy_until +- generate_until - translation - wmt16 - gpt3_translation_benchmarks diff --git a/lm_eval/tasks/translation/wmt16_ro-en.yaml b/lm_eval/tasks/translation/wmt16_ro-en.yaml index 14278794..39441eac 100644 --- a/lm_eval/tasks/translation/wmt16_ro-en.yaml +++ b/lm_eval/tasks/translation/wmt16_ro-en.yaml @@ -6,7 +6,7 @@ doc_to_text: 'Romanian phrase: {{translation["ro"]}} English phrase:' group: -- greedy_until +- generate_until - translation - wmt16 - gpt3_translation_benchmarks diff --git a/lm_eval/tasks/translation/wmt_common_yaml b/lm_eval/tasks/translation/wmt_common_yaml index 5be7c978..2095c1e2 100644 --- a/lm_eval/tasks/translation/wmt_common_yaml +++ b/lm_eval/tasks/translation/wmt_common_yaml @@ -1,4 +1,4 @@ -output_type: greedy_until +output_type: generate_until training_split: train validation_split: validation fewshot_split: validation diff --git a/lm_eval/tasks/triviaqa/default.yaml b/lm_eval/tasks/triviaqa/default.yaml index e0afcec3..67c65acb 100644 --- a/lm_eval/tasks/triviaqa/default.yaml +++ b/lm_eval/tasks/triviaqa/default.yaml @@ -1,7 +1,7 @@ task: triviaqa dataset_path: trivia_qa dataset_name: rc.nocontext -output_type: greedy_until +output_type: generate_until training_split: train validation_split: validation doc_to_text: "Question: {{question}}?\nAnswer:" diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml index 88412ad1..8d2adeaf 100644 --- a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml +++ b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml @@ -3,7 +3,7 @@ group: task: truthfulqa_gen dataset_path: truthful_qa dataset_name: generation -output_type: greedy_until +output_type: generate_until training_split: null validation_split: validation test_split: null diff --git a/lm_eval/tasks/unscramble/anagrams1.yaml b/lm_eval/tasks/unscramble/anagrams1.yaml index c549a07e..b6abf984 100644 --- a/lm_eval/tasks/unscramble/anagrams1.yaml +++ b/lm_eval/tasks/unscramble/anagrams1.yaml @@ -3,7 +3,7 @@ group: task: anagrams1 dataset_path: EleutherAI/unscramble dataset_name: mid_word_1_anagrams -output_type: greedy_until +output_type: generate_until test_split: validation doc_to_text: "{{context}}" doc_to_target: "{{completion}}" diff --git a/lm_eval/tasks/unscramble/anagrams2.yaml b/lm_eval/tasks/unscramble/anagrams2.yaml index 4df34b0c..285d7ced 100644 --- a/lm_eval/tasks/unscramble/anagrams2.yaml +++ b/lm_eval/tasks/unscramble/anagrams2.yaml @@ -3,7 +3,7 @@ group: task: anagrams2 dataset_path: EleutherAI/unscramble dataset_name: mid_word_2_anagrams -output_type: greedy_until +output_type: generate_until test_split: validation doc_to_text: "{{context}}" doc_to_target: "{{completion}}" diff --git a/lm_eval/tasks/unscramble/cycle_letters.yaml b/lm_eval/tasks/unscramble/cycle_letters.yaml index e84d0c96..602adbad 100644 --- a/lm_eval/tasks/unscramble/cycle_letters.yaml +++ b/lm_eval/tasks/unscramble/cycle_letters.yaml @@ -3,7 +3,7 @@ group: task: cycle_letters dataset_path: 
EleutherAI/unscramble dataset_name: cycle_letters_in_word -output_type: greedy_until +output_type: generate_until test_split: validation doc_to_text: "{{context}}" doc_to_target: "{{completion}}" diff --git a/lm_eval/tasks/unscramble/random_insertion.yaml b/lm_eval/tasks/unscramble/random_insertion.yaml index 56f19989..aa4ce86d 100644 --- a/lm_eval/tasks/unscramble/random_insertion.yaml +++ b/lm_eval/tasks/unscramble/random_insertion.yaml @@ -3,7 +3,7 @@ group: task: random_insertion dataset_path: EleutherAI/unscramble dataset_name: random_insertion_in_word -output_type: greedy_until +output_type: generate_until test_split: validation doc_to_text: "{{context}}" doc_to_target: "{{completion}}" diff --git a/lm_eval/tasks/unscramble/reversed_words.yaml b/lm_eval/tasks/unscramble/reversed_words.yaml index 97564422..ffef53f6 100644 --- a/lm_eval/tasks/unscramble/reversed_words.yaml +++ b/lm_eval/tasks/unscramble/reversed_words.yaml @@ -3,7 +3,7 @@ group: task: reversed_words dataset_path: EleutherAI/unscramble dataset_name: reversed_words -output_type: greedy_until +output_type: generate_until test_split: validation doc_to_text: "{{context}}" doc_to_target: "{{completion}}" diff --git a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml index 61d5140d..8d377167 100644 --- a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml +++ b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml @@ -5,7 +5,7 @@ dataset_path: wmt16 dataset_name: ro-en training_split: train validation_split: validation -output_type: greedy_until +output_type: generate_until doc_to_text: "translate English to Romanian: {{translation.en}}" doc_to_target: "{{translation.ro}}" metric_list: -- GitLab From a7ba3d76f26b3a581d0de0a19cb134bbd1debd0e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 17 Oct 2023 15:26:49 +0000 Subject: [PATCH 115/212] changed file name --- lm_eval/tasks/squadv2/{_template.yaml => _template_yaml} | 0 lm_eval/tasks/squadv2/default.yaml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename lm_eval/tasks/squadv2/{_template.yaml => _template_yaml} (100%) diff --git a/lm_eval/tasks/squadv2/_template.yaml b/lm_eval/tasks/squadv2/_template_yaml similarity index 100% rename from lm_eval/tasks/squadv2/_template.yaml rename to lm_eval/tasks/squadv2/_template_yaml diff --git a/lm_eval/tasks/squadv2/default.yaml b/lm_eval/tasks/squadv2/default.yaml index 51a304fc..0f42bf54 100644 --- a/lm_eval/tasks/squadv2/default.yaml +++ b/lm_eval/tasks/squadv2/default.yaml @@ -1,6 +1,6 @@ include: _template_yaml task: squadv2 -output_type: greedy_until +output_type: generate_until generation_kwargs: until: - "\n" -- GitLab From 785153f622e681ebfba07494afc768eafd669b7a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 18 Oct 2023 12:51:10 +0000 Subject: [PATCH 116/212] fix greedy_until abstractmethod --- lm_eval/api/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 42ef1c74..8ab73cff 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -96,7 +96,7 @@ class LM(abc.ABC): # TODO: Add an optional max length @abc.abstractmethod - def greedy_until(self, requests) -> List[str]: + def generate_until(self, requests) -> List[str]: """Generate greedily until a stopping sequence :param requests: list[Instance] -- GitLab From c28d100d00b8cfd07b809bafbcef0dc0d0452d83 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 18 Oct 2023 19:10:08 +0000 Subject: [PATCH 117/212] replace the rest of the 'greedy_until' 
occurrences --- README.md | 16 ++++++++-------- docs/interface.md | 4 ++-- docs/model_guide.md | 20 +++++++++----------- docs/task_guide.md | 4 ++-- lm_eval/api/instance.py | 2 +- lm_eval/api/metrics.py | 6 +++--- lm_eval/api/model.py | 4 ++-- lm_eval/api/registry.py | 2 +- scripts/cost_estimate.py | 2 +- tests/models/test_huggingface.py | 16 ++++++++-------- tests/tests_master/test_models.py | 6 +++--- tests/tests_master/test_version_stable.py | 6 +++--- 12 files changed, 43 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 22afe46e..1bcba7f4 100644 --- a/README.md +++ b/README.md @@ -155,14 +155,14 @@ A full accounting of the supported and planned libraries + APIs can be seen belo | API or Inference Server | Implemented? | `--model ` name | Models supported: | Request Types: | |-----------------------------|---------------------------------|----------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------| -| OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` | -| OpenAI ChatCompletions | :x: Not yet - needs help! | N/A | (link here?) | `greedy_until` (no logprobs) | -| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `greedy_until` (no logprobs) | -| GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` | -| Textsynth | Needs testing | `textsynth` | ??? | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` | -| Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` | -| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` | -| vLLM | :x: Not yet - needs help! | N/A | All HF models | `greedy_until` (no logprobs) | +| OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| OpenAI ChatCompletions | :x: Not yet - needs help! | N/A | (link here?) | `generate_until` (no logprobs) | +| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | +| GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Textsynth | Needs testing | `textsynth` | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| vLLM | :x: Not yet - needs help! | N/A | All HF models | `generate_until` (no logprobs) | | Your inference server here! | ... 
| ... | ... | ... | | ... | It is on our roadmap to create task variants designed to enable models which do not serve logprobs/loglikelihoods to be compared with generation performance of open-source models. diff --git a/docs/interface.md b/docs/interface.md index 860dd1c0..36353e7f 100644 --- a/docs/interface.md +++ b/docs/interface.md @@ -57,7 +57,7 @@ import lm_eval my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code) ... -lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()` +lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()` results = lm_eval.simple_evaluate( # call simple_evaluate model=lm_obj, @@ -83,7 +83,7 @@ from my_tasks import MyTask1 # suppose you've defined a custom lm_eval.api.Task my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code) ... -lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()` +lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()` diff --git a/docs/model_guide.md b/docs/model_guide.md index cf79dd77..a71539b7 100644 --- a/docs/model_guide.md +++ b/docs/model_guide.md @@ -44,26 +44,24 @@ class MyCustomLM(LM): #... - def greedy_until(self, requests: list[Instance]) -> list[str]: + def generate_until(self, requests: list[Instance]) -> list[str]: #... #... ``` Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` which returns a tuple of (context, continuation). -We support +We support three types of requests, consisting of different interactions / measurements with an autoregressive LM. -The three types of +All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name. +- `generate_until` + - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters. + - +- `loglikelihood` + - -smth smth tokenizer-agnostic - -3 reqtypes -- greedy_until, and the arguments passed to it - -- loglikelihood, and args passed to it - -- loglikelihood_rolling, and args passed to it +- `loglikelihood_rolling`, and args passed to it ## Registration diff --git a/docs/task_guide.md b/docs/task_guide.md index 3e15fd9f..5d63c15d 100644 --- a/docs/task_guide.md +++ b/docs/task_guide.md @@ -32,7 +32,7 @@ Prompting / in-context formatting options: - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. if defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice. 
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into -- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `greedy_until` tasks. +- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks. - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples. - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested. @@ -42,7 +42,7 @@ Runtime configuration options: Scoring details: - **metric_list** (`str`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format. -- **output_type** (`str`, *optional*, defaults to "greedy_until") — Selects the type of model output for the given task. Options are `greedy_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`. +- **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`. - **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes. - **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through model for each sample. can be used for cases such as self-consistency. - **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API. 
diff --git a/lm_eval/api/instance.py b/lm_eval/api/instance.py index f3e7f005..7d3c23aa 100644 --- a/lm_eval/api/instance.py +++ b/lm_eval/api/instance.py @@ -4,7 +4,7 @@ from typing import Literal, Tuple @dataclass class Instance: - request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"] + request_type: Literal["loglikelihood", "loglikelihood_rolling", "generate_until"] doc: dict arguments: tuple idx: int diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 16d9a143..02c0a936 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -212,7 +212,7 @@ def f1_fn(items): # This is a passthrough function @register_metric( metric="bleu", higher_is_better=True, - output_type="greedy_until", + output_type="generate_until", aggregation="bleu", ) def bleu_fn(items): # This is a passthrough function @@ -222,7 +222,7 @@ def bleu_fn(items): # This is a passthrough function @register_metric( metric="chrf", higher_is_better=True, - output_type="greedy_until", + output_type="generate_until", aggregation="chrf", ) def chrf_fn(items): # This is a passthrough function @@ -232,7 +232,7 @@ def chrf_fn(items): # This is a passthrough function @register_metric( metric="ter", higher_is_better=True, - output_type="greedy_until", + output_type="generate_until", aggregation="ter", ) def ter_fn(items): # This is a passthrough function diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 8ab73cff..c24026ac 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -211,12 +211,12 @@ class CachingLM: ) for req in tqdm(requests): hsh = hash_args(attr, req.args) - if attr == "greedy_until" and req.args[1].get("do_sample", False): + if attr == "generate_until" and req.args[1].get("do_sample", False): # when we are doing non-greedy generation, don't use the cache # (else every "randomly sampled" generation would be identical for repeats > 1). if not warned: eval_logger.warning( - f"Arguments to lm.greedy_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests." + f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests." 
) warned = True res.append(None) diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 53e5771a..f227a30b 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -81,7 +81,7 @@ DEFAULT_METRIC_REGISTRY = { ], "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"], "multiple_choice": ["acc", "acc_norm"], - "greedy_until": ["exact_match"], + "generate_until": ["exact_match"], } diff --git a/scripts/cost_estimate.py b/scripts/cost_estimate.py index e8e0c35b..72b8d4b3 100644 --- a/scripts/cost_estimate.py +++ b/scripts/cost_estimate.py @@ -23,7 +23,7 @@ class DryrunLM(LM): return res - def greedy_until(self, requests): + def generate_until(self, requests): res = [] for ctx, _ in requests: diff --git a/tests/models/test_huggingface.py b/tests/models/test_huggingface.py index 0ecd02aa..1fd9464a 100644 --- a/tests/models/test_huggingface.py +++ b/tests/models/test_huggingface.py @@ -15,10 +15,10 @@ class Test_HFLM: multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1) MULTIPLE_CH: list[Instance] = multiple_choice_task.instances - greedy_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")() # type: ignore - greedy_until_task.build_all_requests(limit=10, rank=0, world_size=1) - greedy_until_task._config.generation_kwargs["max_gen_toks"] = 10 - GREEDY_UNTIL: list[Instance] = greedy_until_task.instances + generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")() # type: ignore + generate_until_task.build_all_requests(limit=10, rank=0, world_size=1) + generate_until_task._config.generation_kwargs["max_gen_toks"] = 10 + generate_until: list[Instance] = generate_until_task.instances rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore rolling_task.build_all_requests(limit=10, rank=0, world_size=1) ROLLING: list[Instance] = rolling_task.instances @@ -65,7 +65,7 @@ class Test_HFLM: -52.70050811767578, -56.25089645385742, ] - GREEDY_UNTIL_RES = [ + generate_until_RES = [ " The average of $2.50 each is $", " A robe takes 2 bolts of blue fiber and half", " $50,000 in repairs.", @@ -109,9 +109,9 @@ class Test_HFLM: ), np.argmax(np.array(_res).reshape(-1, 4), axis=1) assert (argmax_RES == argmax_res).all() - def test_greedy_until(self) -> None: - res = self.LM.greedy_until(self.GREEDY_UNTIL) - assert res == self.GREEDY_UNTIL_RES + def test_generate_until(self) -> None: + res = self.LM.generate_until(self.generate_until) + assert res == self.generate_until_RES def test_logliklihood_rolling(self) -> None: res = self.LM.loglikelihood_rolling(self.ROLLING) diff --git a/tests/tests_master/test_models.py b/tests/tests_master/test_models.py index c50332da..b3f6b1b8 100644 --- a/tests/tests_master/test_models.py +++ b/tests/tests_master/test_models.py @@ -78,7 +78,7 @@ def test_gpt2(): # test empty context gpt2.loglikelihood([("", "test")]) - (gen,) = gpt2.greedy_until( + (gen,) = gpt2.generate_until( [("The quick brown fox jumps over the lazy", [".", "\n"])] ) @@ -204,7 +204,7 @@ def test_gpt3(): # test empty context gpt3.loglikelihood([("", "test")]) - (gen,) = gpt3.greedy_until( + (gen,) = gpt3.generate_until( [("The quick brown fox jumps over the lazy", [".", "\n"])] ) @@ -300,7 +300,7 @@ def test_textsynth(): # test empty context textsynth.loglikelihood([("", "test")]) - (gen,) = textsynth.greedy_until( + (gen,) = textsynth.generate_until( [("The quick brown fox jumps over the lazy", [".", "\n"])] ) diff --git 
a/tests/tests_master/test_version_stable.py b/tests/tests_master/test_version_stable.py index 3d639122..2eba83c6 100644 --- a/tests/tests_master/test_version_stable.py +++ b/tests/tests_master/test_version_stable.py @@ -98,9 +98,9 @@ def test_versions_stable(taskname, task_class): return res - def greedy_until(reqs): + def generate_until(reqs): res = [] - assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs) + assert_target_hashed(f"{taskname}-v{task_class.VERSION}-generate_until", reqs) for ctx, _ in [req.args for req in reqs]: res.append("lol") @@ -110,7 +110,7 @@ lm.loglikelihood = ll_fn lm.loglikelihood_rolling = ll_perp_fn - lm.greedy_until = greedy_until + lm.generate_until = generate_until limit = None result = evaluator.evaluate( -- GitLab From e66ba123cc9e16e50e90c6505737728e92f8db00 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 18 Oct 2023 19:21:02 +0000 Subject: [PATCH 118/212] don't raise error for unset higher_is_better --- lm_eval/api/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index f227a30b..3601835e 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -171,7 +171,6 @@ def is_higher_better(metric_name): try: return HIGHER_IS_BETTER_REGISTRY[metric_name] except KeyError: - raise Warning(f"higher_is_better not specified for metric '{metric_name}'!") eval_logger.warning( f"higher_is_better not specified for metric '{metric_name}'!" ) -- GitLab From a8d130abab6623fca6dc53b8d2af94491836d0db Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Wed, 18 Oct 2023 23:03:07 -0400 Subject: [PATCH 119/212] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 22afe46e..4a7f3508 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ python -m lm_eval \ --batch_size 8 ``` -Models that are loaded via either `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) or `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported via Support for this model type is currently pending. +Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported. Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to above flag to automatically recompute the largest batch size ```N``` times. 
For example, to recompute the batch size 4 times, the command would be: -- GitLab From a007bacb1941356233b0aba49dbcea9bec10429e Mon Sep 17 00:00:00 2001 From: Zhiwei Zhuang Date: Thu, 19 Oct 2023 15:50:51 +0800 Subject: [PATCH 120/212] fix two bugs when run with qasper_bool and toxigen --- lm_eval/__main__.py | 29 +++++++++++++++++++++++------ lm_eval/tasks/__init__.py | 3 ++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index c877262e..8d30f878 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -2,11 +2,10 @@ import os import re import json import fnmatch -import jsonlines import argparse import logging from pathlib import Path - +import numpy as np from lm_eval import evaluator, utils from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger, SPACING @@ -15,6 +14,14 @@ from lm_eval.tasks import include_path from typing import Union +def _handle_non_serializable(o): + if isinstance(o, np.int64): + return int(o) + elif isinstance(o, set): + return list(o) + raise TypeError(f"Object of type {o.__class__.__name__} is not JSON serializable") + + def parse_eval_args() -> argparse.Namespace: parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--model", required=True, help="Name of model e.g. `hf`") @@ -103,6 +110,12 @@ def parse_eval_args() -> argparse.Namespace: default="INFO", help="Log error when tasks are not registered.", ) + parser.add_argument( + "--huggingface_token", + type=str, + default=None, + help="huggingface token for downloading some authorization datasets, like toxigen, https://huggingface.co/settings/tokens", + ) return parser.parse_args() @@ -119,7 +132,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
) + if args.huggingface_token: + from huggingface_hub import login + login(token=args.huggingface_token) if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") include_path(args.include_path) @@ -195,7 +211,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: if results is not None: if args.log_samples: samples = results.pop("samples") - dumped = json.dumps(results, indent=2, default=lambda o: str(o)) + dumped = json.dumps(results, indent=2, default=_handle_non_serializable) if args.show_config: print(dumped) @@ -210,9 +226,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: re.sub("/|=", "__", args.model_args), task_name ) filename = path.joinpath(f"{output_name}.jsonl") - - with jsonlines.open(filename, "w") as f: - f.write_all(samples[task_name]) + samples_dumped = json.dumps( + samples[task_name], indent=2, default=_handle_non_serializable + ) + filename.open("w").write(samples_dumped) print( f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index eb9d3353..d36b6077 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -15,7 +15,8 @@ from lm_eval.api.registry import ( import logging -eval_logger = logging.getLogger('lm-eval') +eval_logger = logging.getLogger("lm-eval") + def register_configurable_task(config: Dict[str, str]) -> int: SubClass = type( -- GitLab From 0c5de3ac58e888be9d39e3ea45a55e6ca51a7b59 Mon Sep 17 00:00:00 2001 From: Zhiwei Zhuang Date: Thu, 19 Oct 2023 17:59:29 +0800 Subject: [PATCH 121/212] fix: Object of type int32 is not JSON serializable --- lm_eval/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index 8d30f878..d7b5fd3d 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -15,7 +15,7 @@ from typing import Union def _handle_non_serializable(o): - if isinstance(o, np.int64): + if isinstance(o, np.int64) or isinstance(o, np.int32): -- GitLab From a6179b494cb835e0dbdc4c19fa42055d652ef8fc Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 19 Oct 2023 21:06:39 +0700 Subject: [PATCH 122/212] Update registry.py Removed `get_default_aggregation` --- lm_eval/api/registry.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 3601835e..a946c9b2 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -158,15 +158,6 @@ def get_aggregation(name): ) -def get_default_aggregation(metric_name): - try: - return DEFAULT_AGGREGATION_REGISTRY[metric_name] - except KeyError: - eval_logger.warning( - f"No default aggregation metric for metric '{metric_name}'!" 
- ) - - def is_higher_better(metric_name): try: return HIGHER_IS_BETTER_REGISTRY[metric_name] -- GitLab From 6b4161c17bde2b063c59769b9f1378a305059044 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 19 Oct 2023 21:08:28 +0700 Subject: [PATCH 123/212] Changed `get_default_aggregation` to `get_aggregation` --- lm_eval/api/task.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 32813dec..375c967d 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -33,7 +33,6 @@ from lm_eval.api.metrics import ( from lm_eval.api.registry import ( get_metric, get_aggregation, - get_default_aggregation, is_higher_better, DEFAULT_METRIC_REGISTRY, OUTPUT_TYPE_REGISTRY, @@ -543,9 +542,7 @@ class ConfigurableTask(Task): # TODO: handle this in TaskConfig.__post_init__ ? for metric_name in _metric_list: self._metric_fn_list[metric_name] = get_metric(metric_name) - self._aggregation_list[metric_name] = get_default_aggregation( - metric_name - ) + self._aggregation_list[metric_name] = get_aggregation(metric_name) self._higher_is_better[metric_name] = is_higher_better(metric_name) else: for metric_config in self.config.metric_list: -- GitLab From 90d818daa915199d32e833975d60671ff4b5b451 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 19 Oct 2023 14:42:02 +0000 Subject: [PATCH 124/212] fix issue with default metrics and aggregation functions --- lm_eval/api/metrics.py | 10 ++++++++++ lm_eval/api/registry.py | 19 ++++++++++++++----- lm_eval/api/task.py | 8 ++++++-- lm_eval/models/huggingface.py | 22 ++++++++++++++++++---- lm_eval/tasks/__init__.py | 3 ++- 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 02c0a936..bc4b7f00 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -135,6 +135,16 @@ def acc_mutual_info_fn(items): # This is a passthrough function return items +@register_metric( + metric="exact_match", + higher_is_better=True, + output_type="generate_until", + aggregation="mean", +) +def exact_match_fn(items): # This is a passthrough function + return items + + @register_metric( metric="perplexity", higher_is_better=False, diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index a946c9b2..aebe8c04 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -68,10 +68,10 @@ def register_group(name): return decorate -AGGREGATION_REGISTRY = {} -DEFAULT_AGGREGATION_REGISTRY = {} -METRIC_REGISTRY = {} OUTPUT_TYPE_REGISTRY = {} +METRIC_REGISTRY = {} +METRIC_AGGREGATION_REGISTRY = {} +AGGREGATION_REGISTRY = {} HIGHER_IS_BETTER_REGISTRY = {} DEFAULT_METRIC_REGISTRY = { @@ -95,8 +95,7 @@ def register_metric(**args): for key, registry in [ ("metric", METRIC_REGISTRY), ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), - # ("output_type", OUTPUT_TYPE_REGISTRY), - ("aggregation", DEFAULT_AGGREGATION_REGISTRY), + ("aggregation", METRIC_AGGREGATION_REGISTRY), ]: if key in args: @@ -158,6 +157,16 @@ def get_aggregation(name): ) +def get_metric_aggregation(name): + + try: + return METRIC_AGGREGATION_REGISTRY[name] + except KeyError: + eval_logger.warning( + "{} not a registered aggregation metric!".format(name), + ) + + def is_higher_better(metric_name): try: return HIGHER_IS_BETTER_REGISTRY[metric_name] diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 375c967d..6067e159 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -33,6 +33,7 @@ from lm_eval.api.metrics import ( from lm_eval.api.registry import ( 
get_metric, get_aggregation, + get_metric_aggregation, is_higher_better, DEFAULT_METRIC_REGISTRY, OUTPUT_TYPE_REGISTRY, @@ -537,12 +538,15 @@ class ConfigurableTask(Task): self._aggregation_list = {} self._higher_is_better = {} - _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] if self.config.metric_list is None: # TODO: handle this in TaskConfig.__post_init__ ? + _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] + for metric_name in _metric_list: self._metric_fn_list[metric_name] = get_metric(metric_name) - self._aggregation_list[metric_name] = get_aggregation(metric_name) + self._aggregation_list[metric_name] = get_metric_aggregation( + metric_name + ) self._higher_is_better[metric_name] = is_higher_better(metric_name) else: for metric_config in self.config.metric_list: diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index dfc88c77..e24d9567 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -663,8 +663,16 @@ class HFLM(LM): chunks = utils.chunks( re_ord.get_reordered(), - n=self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0, - fn=self._batch_scheduler if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs else None, + n=self.batch_size + if self.batch_size != "auto" + else override_bs + if override_bs is not None + else 0, + fn=self._batch_scheduler + if self.batch_size == "auto" + and n_reordered_requests > 0 + and not override_bs + else None, ) for chunk in tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))): @@ -840,8 +848,14 @@ class HFLM(LM): for key, re_ord in re_ords.items(): chunks = utils.chunks( re_ord.get_reordered(), - n=self.batch_size if self.batch_size != "auto" else adaptive_batch_size if adaptive_batch_size is not None else 0, - fn=self._batch_scheduler if self.batch_size == "auto" and not adaptive_batch_size else None, + n=self.batch_size + if self.batch_size != "auto" + else adaptive_batch_size + if adaptive_batch_size is not None + else 0, + fn=self._batch_scheduler + if self.batch_size == "auto" and not adaptive_batch_size + else None, ) for chunk in tqdm(chunks, disable=self.rank != 0): contexts, all_gen_kwargs = zip(*chunk) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index eb9d3353..d36b6077 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -15,7 +15,8 @@ from lm_eval.api.registry import ( import logging -eval_logger = logging.getLogger('lm-eval') +eval_logger = logging.getLogger("lm-eval") + def register_configurable_task(config: Dict[str, str]) -> int: SubClass = type( -- GitLab From 9fbe6eef799213340d9871f23c853dcf8a154d43 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 19 Oct 2023 14:54:30 +0000 Subject: [PATCH 125/212] changed warning message --- lm_eval/api/registry.py | 2 +- lm_eval/api/task.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index aebe8c04..4e78048b 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -163,7 +163,7 @@ def get_metric_aggregation(name): return METRIC_AGGREGATION_REGISTRY[name] except KeyError: eval_logger.warning( - "{} not a registered aggregation metric!".format(name), + "{} metric is not assigned a default aggregation!".format(name), ) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 6067e159..60b5ccbf 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -587,7 +587,7 @@ class ConfigurableTask(Task): ] 
else: INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()} - metric_agg = get_default_aggregation(metric_name) + metric_agg = get_metric_aggregation(metric_name) eval_logger.warning( f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. " f"using default " -- GitLab From 8d4d1fa9252539951e2b3b1a9d347d96484a5cd0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 19 Oct 2023 15:24:45 +0000 Subject: [PATCH 126/212] fixed registered metric --- lm_eval/api/metrics.py | 5 +++-- lm_eval/api/task.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index bc4b7f00..a6d3abc3 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -5,6 +5,7 @@ import numpy as np import sacrebleu import sklearn.metrics import random +import evaluate from lm_eval.api.registry import register_metric, register_aggregation @@ -141,8 +142,8 @@ def acc_mutual_info_fn(items): # This is a passthrough function output_type="generate_until", aggregation="mean", ) -def exact_match_fn(items): # This is a passthrough function - return items +def exact_match_fn(**kwargs): # This is a passthrough function + return evaluate.load("exact_match").compute(**kwargs) @register_metric( diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 60b5ccbf..ba303db0 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -544,6 +544,7 @@ class ConfigurableTask(Task): for metric_name in _metric_list: self._metric_fn_list[metric_name] = get_metric(metric_name) + self._metric_fn_kwargs[metric_name] = {} self._aggregation_list[metric_name] = get_metric_aggregation( metric_name ) -- GitLab From ff148cd862f51617724cf1f2c48dc83c47df980c Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 19 Oct 2023 23:05:34 +0000 Subject: [PATCH 127/212] don't call evaluate.load() every time --- lm_eval/api/metrics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index a6d3abc3..69e66fdc 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -136,14 +136,17 @@ def acc_mutual_info_fn(items): # This is a passthrough function return items +exact_match = evaluate.load("exact_match") + + @register_metric( metric="exact_match", higher_is_better=True, output_type="generate_until", aggregation="mean", ) -def exact_match_fn(**kwargs): # This is a passthrough function - return evaluate.load("exact_match").compute(**kwargs) +def exact_match_fn(**kwargs): + return exact_match.compute(**kwargs) @register_metric( -- GitLab From d9b4ca7f93e1410e3910e254d94b797df62f0eff Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 19 Oct 2023 23:06:10 +0000 Subject: [PATCH 128/212] fix bug when target_delimiter is empty string --- lm_eval/api/task.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index ba303db0..898909e4 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -689,7 +689,10 @@ class ConfigurableTask(Task): for choice in check_choices: choice_has_whitespace = True if choice[0].isspace() else False delimiter_has_whitespace = ( - True if self.config.target_delimiter[-1].isspace() else False + True + if self.config.target_delimiter.rstrip() + == self.config.target_delimiter + else False ) if delimiter_has_whitespace and choice_has_whitespace: -- GitLab From 1428ad575ffd701be935e3fc1833d525334091ff Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 19 Oct 2023 
23:06:54 +0000 Subject: [PATCH 129/212] fixes to pythia benchmark, boolqseq2seq --- lm_eval/tasks/benchmarks/pythia.yaml | 2 +- lm_eval/tasks/super_glue/boolq/seq2seq.yaml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lm_eval/tasks/benchmarks/pythia.yaml b/lm_eval/tasks/benchmarks/pythia.yaml index bb5b1174..bdeadd3c 100644 --- a/lm_eval/tasks/benchmarks/pythia.yaml +++ b/lm_eval/tasks/benchmarks/pythia.yaml @@ -9,4 +9,4 @@ task: - wsc - ai2_arc - blimp - - hendrycksTest* + - mmlu diff --git a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml index 948ee247..b1c0048f 100644 --- a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml +++ b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml @@ -8,7 +8,8 @@ training_split: train validation_split: validation doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:" doc_to_target: label -doc_to_choice: ['no', 'yes'] +doc_to_choice: [' no', ' yes'] +target_delimiter: "" generation_kwargs: until: - "\n\n" -- GitLab From bff237e6badc292f8f87af11f679a822163d5b5f Mon Sep 17 00:00:00 2001 From: Zhiwei Zhuang Date: Fri, 20 Oct 2023 15:12:50 +0800 Subject: [PATCH 130/212] change huggingface_login from str to bool for privacy --- lm_eval/__main__.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index d7b5fd3d..2155e6f3 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -111,9 +111,9 @@ def parse_eval_args() -> argparse.Namespace: help="Log error when tasks are not registered.", ) parser.add_argument( - "--huggingface_token", - type=str, - default=None, + "--huggingface_login", + action="store_true", + default=False, help="huggingface token for downloading some authorization datasets, like toxigen, https://huggingface.co/settings/tokens", ) return parser.parse_args() @@ -132,10 +132,14 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." ) - if args.huggingface_token: + if args.huggingface_login: from huggingface_hub import login - login(token=args.huggingface_token) + assert ( + "HUGGINGFACE_LOGIN_TOKEN" in os.environ + ), "Your environment variable does not contain a HUGGINGFACE_LOGIN_TOKEN. Please set the token first." 
+ huggingface_token = os.environ["HUGGINGFACE_LOGIN_TOKEN"] + login(token=huggingface_token) if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") include_path(args.include_path) -- GitLab From 305b546084a9ac7766565a8c7228f523c38efb90 Mon Sep 17 00:00:00 2001 From: Zhiwei Zhuang Date: Fri, 20 Oct 2023 18:00:03 +0800 Subject: [PATCH 131/212] Force conversion to str if type exceeds expected --- lm_eval/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index 2155e6f3..f57d712c 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -19,7 +19,9 @@ def _handle_non_serializable(o): return int(o) elif isinstance(o, set): return list(o) - raise TypeError(f"Object of type {o.__class__.__name__} is not JSON serializable") + else: + print(f"Object of type {o.__class__.__name__} is not JSON serializable, just stringify it") + return str(o) def parse_eval_args() -> argparse.Namespace: @@ -114,7 +116,7 @@ def parse_eval_args() -> argparse.Namespace: "--huggingface_login", action="store_true", default=False, - help="huggingface token for downloading some authorization datasets, like toxigen, https://huggingface.co/settings/tokens", + help="huggingface token for downloading some authorization datasets, like toxigen, you need to add HUGGINGFACE_LOGIN_TOKEN to your environment variables first. https://huggingface.co/settings/tokens", ) return parser.parse_args() -- GitLab From 1964ccd302215f59828a7b328ba78a61e577b138 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Fri, 20 Oct 2023 18:42:45 +0200 Subject: [PATCH 132/212] feat: add --include_path option to write out --- scripts/write_out.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/scripts/write_out.py b/scripts/write_out.py index df39bd26..d6b1f012 100644 --- a/scripts/write_out.py +++ b/scripts/write_out.py @@ -2,9 +2,9 @@ import argparse import numpy as np import json import os -import random -from lm_eval import tasks from lm_eval.utils import join_iters +from lm_eval.tasks import include_path +from lm_eval.logger import eval_logger EXAMPLE_DIVIDER = "!!@@##@@!! 
-- Example {i}\n" @@ -17,6 +17,12 @@ def parse_args(): parser.add_argument("--num_fewshot", type=int, default=1) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--num_examples", type=int, default=1) + parser.add_argument( + "--include_path", + type=str, + default=None, + help="Additional path to include if there are external tasks to include.", + ) return parser.parse_args() @@ -24,13 +30,17 @@ def main(): args = parse_args() np.random.seed(args.seed) + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + include_path(args.include_path) + if args.tasks == "all_tasks": task_names = tasks.ALL_TASKS else: task_names = args.tasks.split(",") task_dict = tasks.get_task_dict(task_names) - - os.makedirs(args.output_base_path, exist_ok=True) + + os.makedirs(args.output_base_path, exist_ok=True) for task_name, task in task_dict.items(): rnd = random.Random() rnd.seed(args.seed) -- GitLab From 1a183d0374ca4df9d2715ad5f7cdb41a27264170 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Fri, 20 Oct 2023 18:45:36 +0200 Subject: [PATCH 133/212] fix: copy paste mistake --- scripts/write_out.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/write_out.py b/scripts/write_out.py index d6b1f012..cc15ad33 100644 --- a/scripts/write_out.py +++ b/scripts/write_out.py @@ -2,6 +2,8 @@ import argparse import numpy as np import json import os +import random +from lm_eval import tasks from lm_eval.utils import join_iters from lm_eval.tasks import include_path from lm_eval.logger import eval_logger @@ -39,8 +41,8 @@ def main(): else: task_names = args.tasks.split(",") task_dict = tasks.get_task_dict(task_names) - - os.makedirs(args.output_base_path, exist_ok=True) + + os.makedirs(args.output_base_path, exist_ok=True) for task_name, task in task_dict.items(): rnd = random.Random() rnd.seed(args.seed) -- GitLab From b3a2af333f9597ec74eab79b005c85d9695c6cb1 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:04:59 -0400 Subject: [PATCH 134/212] Update pyproject.toml --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0689c34c..45ccd3c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,9 +41,13 @@ dependencies = [ [tool.setuptools] packages = ["lm_eval"] + +[tool.setuptools.packages.find] +include = ["lm_eval*"] + # required to include yaml files in pip installation [tool.setuptools.package-data] -lm_eval = ["**/*.yaml", "api/**/*", "decontamination/**/*", "filters/**/*", "models/**/*", "prompts/**/*", "tasks/**/*"] +lm_eval = ["**/*.yaml", "prompts/**/*", "tasks/**/*"] examples = ["**/*.yaml"] [project.scripts] -- GitLab From e7e3adeb3a0f047caba90fb3fbdb14e3a801ee2d Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:06:36 -0400 Subject: [PATCH 135/212] Update pyproject.toml --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 45ccd3c2..8b0eaecd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,10 +38,6 @@ dependencies = [ "zstandard", ] -[tool.setuptools] -packages = ["lm_eval"] - - [tool.setuptools.packages.find] include = ["lm_eval*"] -- GitLab From fa1f35718631060e6057b9f2cd3c82d77593c8ba Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:10:28 -0400 
Subject: [PATCH 136/212] Update pyproject.toml --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8b0eaecd..f495a57f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,9 @@ dependencies = [ ] [tool.setuptools.packages.find] -include = ["lm_eval*"] +where = ["lm_eval"] +include = ["*"] +namespaces = false # required to include yaml files in pip installation [tool.setuptools.package-data] -- GitLab From a1b589df8e8a6eff383459bf01166a54d3decbe4 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:45:07 -0400 Subject: [PATCH 137/212] Update pyproject.toml --- pyproject.toml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f495a57f..7a9b8d6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,14 +39,11 @@ dependencies = [ ] [tool.setuptools.packages.find] -where = ["lm_eval"] -include = ["*"] -namespaces = false +include = ["lm_eval"] # required to include yaml files in pip installation [tool.setuptools.package-data] -lm_eval = ["**/*.yaml", "prompts/**/*", "tasks/**/*"] -examples = ["**/*.yaml"] +lm_eval = ["**/*.yaml", "tasks/**/*"] [project.scripts] lm-eval = "lm_eval.__main__:cli_evaluate" -- GitLab From 84db6359e1079f9ba90dcd54abcff908ee481ec6 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:50:33 -0400 Subject: [PATCH 138/212] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7a9b8d6e..fd9fc79b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ ] [tool.setuptools.packages.find] -include = ["lm_eval"] +include = ["lm_eval*"] # required to include yaml files in pip installation [tool.setuptools.package-data] -- GitLab From 888d30350ac7b20077bc7a540ee1e82a9c2b1220 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 31 Oct 2023 19:31:24 +0000 Subject: [PATCH 139/212] fix warning for delimiter having no whitespace --- lm_eval/api/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 898909e4..883c643d 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -691,7 +691,7 @@ class ConfigurableTask(Task): delimiter_has_whitespace = ( True if self.config.target_delimiter.rstrip() - == self.config.target_delimiter + != self.config.target_delimiter else False ) @@ -701,7 +701,7 @@ class ConfigurableTask(Task): ) elif (not delimiter_has_whitespace) and (not choice_has_whitespace): eval_logger.warning( - f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace' + f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace' ) def download(self, dataset_kwargs=None) -> None: -- GitLab From dd7002d6c1324a79a7e97ec02211e1e3f808dbe4 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 1 Nov 2023 13:34:10 +0000 Subject: [PATCH 140/212] add task and group alias --- lm_eval/api/task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 898909e4..3d8052be 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -52,7 
+52,9 @@ ALL_OUTPUT_TYPES = [ class TaskConfig(dict): # task naming/registry task: str = None + task_alias: str = None group: Union[str, list] = None + group_alias: Union[str, list] = None # HF dataset options. # which dataset to use, # and what splits for what purpose -- GitLab From f5bdefe84a07fb2db1fb3593d005c411eb21fcdf Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 1 Nov 2023 15:00:48 +0000 Subject: [PATCH 141/212] new way to display tasks --- lm_eval/evaluator.py | 54 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index caf84941..27612647 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -221,6 +221,7 @@ def evaluate( task_hierarchy = collections.defaultdict(list) # store the ordering of tasks and groups task_order = collections.defaultdict(int) + task_group_alias = collections.defaultdict(dict) # get lists of each type of request for task_name, task in task_dict.items(): @@ -228,6 +229,10 @@ def evaluate( group_name, task = task task_hierarchy[group_name].append(task_name) versions[group_name] = "N/A" + + if "group_alias" in configs[task_name]: + task_group_alias[group_name] = configs[task_name]["group_alias"] + else: task_hierarchy[task_name] = [] @@ -237,6 +242,9 @@ def evaluate( versions[task_name] = task.VERSION configs[task_name] = dict(task.dump_config()) + if "task_alias" in configs[task_name]: + task_group_alias[task_name] = configs[task_name]["task_alias"] + if limit is not None: if task.has_test_docs(): task_docs = task.test_docs() @@ -522,19 +530,19 @@ def evaluate( results[group]["samples"] = total_size - def print_tasks(task_hierarchy, task_order, task_version): + def print_tasks(task_hierarchy, task_order, task_version, task_group_alias): results_agg = collections.defaultdict(dict) groups_agg = collections.defaultdict(dict) for group_name, task_list in task_hierarchy.items(): order = task_order[group_name] - tabbed_name = "-" * order + group_name - results_agg[tabbed_name] = results[group_name] - task_version[tabbed_name] = task_version[group_name] + results_agg[group_name] = results[group_name] + results_agg[group_name]["tab"] = order if (order < max(task_order.values())) and (len(task_list) > 0): - groups_agg[tabbed_name] = results[group_name] + groups_agg[group_name] = results[group_name] + groups_agg[group_name]["tab"] = order if task_list != []: for task in sorted(task_list): @@ -544,7 +552,7 @@ def evaluate( _task_hierarchy = {task: []} _results_agg, _groups_agg, task_version = print_tasks( - _task_hierarchy, task_order, task_version + _task_hierarchy, task_order, task_version, task_group_alias ) results_agg = {**results_agg, **_results_agg} @@ -553,9 +561,41 @@ def evaluate( return results_agg, groups_agg, task_version results_agg, groups_agg, versions = print_tasks( - task_hierarchy, task_order, versions + task_hierarchy, task_order, versions, task_group_alias ) + _results_agg = collections.defaultdict(dict) + _versions = collections.defaultdict(dict) + for task in results_agg: + task_results = results_agg[task] + if "tab" in task_results: + tab = task_results.pop("tab") + tab_string = " "*(tab-1)+"-" if tab > 0 else "" + + if task in task_group_alias: + task_alias = task_group_alias[task] + _results_agg[tab_string+task_alias] = task_results + _versions[tab_string+task_alias] = versions[task] + else: + _results_agg[tab_string+task] = task_results + _versions[tab_string+task] = versions[task] + results_agg = _results_agg + versions = 
_versions + + _groups_agg = collections.defaultdict(dict) + for group in groups_agg: + group_results = groups_agg[group] + if "tab" in group_results: + tab = group_results.pop("tab") + tab_string = " "*(tab-1)+"-" if tab > 0 else "" + + if group in task_group_alias: + group_alias = task_group_alias[group] + _groups_agg[tab_string+group_alias] = group_results + else: + _groups_agg[tab_string+group] = group_results + groups_agg = _groups_agg + results_dict = { "results": dict(results_agg.items()), **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}), -- GitLab From 60dd33c80f61373455de89d1a54101144849ba19 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 1 Nov 2023 15:02:15 +0000 Subject: [PATCH 142/212] new way to display tasks --- lm_eval/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 27612647..81f4efd7 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -570,7 +570,7 @@ def evaluate( task_results = results_agg[task] if "tab" in task_results: tab = task_results.pop("tab") - tab_string = " "*(tab-1)+"-" if tab > 0 else "" + tab_string = " "*tab+"-" if tab > 0 else "" if task in task_group_alias: task_alias = task_group_alias[task] @@ -587,7 +587,7 @@ def evaluate( group_results = groups_agg[group] if "tab" in group_results: tab = group_results.pop("tab") - tab_string = " "*(tab-1)+"-" if tab > 0 else "" + tab_string = " "*tab+"-" if tab > 0 else "" if group in task_group_alias: group_alias = task_group_alias[group] -- GitLab From 55407cd689fbd424ae3658bbe9ecebd2adedc5af Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 1 Nov 2023 15:43:13 +0000 Subject: [PATCH 143/212] fix tqdm total for hf model --- lm_eval/models/huggingface.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index e24d9567..54cf98a4 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -675,7 +675,10 @@ class HFLM(LM): else None, ) - for chunk in tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))): + pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0))) + for ( + chunk + ) in chunks: # tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))): inps = [] cont_toks_list = [] inplens = [] @@ -812,6 +815,9 @@ class HFLM(LM): res.append(answer) self.cache_hook.add_partial("loglikelihood", cache_key, answer) + pbar.update(1) + + pbar.close() return re_ord.get_original(res) -- GitLab From 2fda31ca0490a010131d8d676f4003f5132d6ae3 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 1 Nov 2023 16:54:12 +0000 Subject: [PATCH 144/212] remove nonstandard logger from evaluator.py --- lm_eval/evaluator.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index bf35097c..177eab31 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -2,7 +2,6 @@ import random import itertools import json import collections -import logging import sys import torch @@ -25,10 +24,6 @@ from lm_eval.utils import ( from lm_eval.logger import eval_logger -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stdout)) - @positional_deprecated def simple_evaluate( -- GitLab From a922db886c1fa0c0b8e8a5c40e6b8a12a44c1546 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 1 Nov 2023 16:55:03 +0000 Subject: [PATCH 145/212] rename template for generate_until bb tasks --- 
.../{greedy_until_template_yaml => generate_until_template_yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lm_eval/tasks/bigbench/{greedy_until_template_yaml => generate_until_template_yaml} (100%) diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/generate_until_template_yaml similarity index 100% rename from lm_eval/tasks/bigbench/greedy_until_template_yaml rename to lm_eval/tasks/bigbench/generate_until_template_yaml -- GitLab From f5cf38a9b730c81c39d35483de4a20c473d4ac3f Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 1 Nov 2023 17:05:06 +0000 Subject: [PATCH 146/212] replace with no_train mmlu dataset --- lm_eval/tasks/mmlu/default/_default_template_yaml | 2 +- .../mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml | 2 +- .../mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml | 2 +- .../tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml | 2 +- .../mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index bd989c40..af4bf12c 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -1,5 +1,5 @@ group: mmlu -dataset_path: cais/mmlu +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split test_split: test fewshot_split: dev fewshot_config: diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index 4b54fb41..e340271a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -1,5 +1,5 @@ group: mmlu_flan_cot_fewshot -dataset_path: cais/mmlu +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split validation_split: validation fewshot_split: dev output_type: generate_until diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml index 37c95ce7..1e276204 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml @@ -1,5 +1,5 @@ group: mmlu_flan_cot_zeroshot -dataset_path: cais/mmlu +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split validation_split: validation fewshot_split: dev output_type: generate_until diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml index 49046d22..93dc8c71 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml @@ -1,5 +1,5 @@ group: mmlu_flan_n_shot_generative -dataset_path: cais/mmlu +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split test_split: test fewshot_split: dev output_type: generate_until diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml index 5db2981a..3efc2e42 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ 
b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml
@@ -1,5 +1,5 @@
 group: mmlu_flan_n_shot_loglikelihood
-dataset_path: cais/mmlu
+dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
 test_split: test
 fewshot_split: dev
 output_type: multiple_choice
--
GitLab


From 7fa0761cb5514158c94ab33cb20161b50f03e3c2 Mon Sep 17 00:00:00 2001
From: haileyschoelkopf
Date: Wed, 1 Nov 2023 18:00:42 +0000
Subject: [PATCH 147/212] clean up documentation

---
 docs/model_guide.md    | 33 +++++++++++++++++++++++++++++----
 docs/new_task_guide.md |  6 +++---
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/docs/model_guide.md b/docs/model_guide.md
index a71539b7..10c58e06 100644
--- a/docs/model_guide.md
+++ b/docs/model_guide.md
@@ -48,7 +48,7 @@ class MyCustomLM(LM):
     #...
     #...
 ```
-Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` which returns a tuple of (context, continuation).
+Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` of request-dependent type signature described below.

 We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.

@@ -56,14 +56,37 @@ All three request types take as input `requests` of type `list[Instance]` that h
 - `generate_until`
   - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
-  -
-
+  - Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or a specific stopping string sequence is reached; for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
+  - The generated input+output text from the model will then be returned.
 - `loglikelihood`
-  -
-
+  - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
+  - Each request returns `(ll, is_greedy): Tuple[float, int]`, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` is either `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input).
-- `loglikelihood_rolling`, and args passed to it
+- `loglikelihood_rolling`
+  - Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
+  - This is used to evaluate *perplexity* on a data distribution.
+  - It should return `(ll,): Tuple[float]`, a.k.a. solely the *loglikelihood* of producing each piece of text given no starting input.

+To allow a model to be evaluated on all types of tasks, you will need to implement these three types of measurements (note that `loglikelihood_rolling` is a special case of `loglikelihood`). For a reference implementation, check out `lm_eval/models/huggingface.py`!
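+
+As a rough sketch (not the verbatim reference implementation), a `loglikelihood` method might loop over requests along the following lines. Here `score_target()` is a hypothetical stand-in for whatever scoring call your inference backend actually provides:
+
+```python
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import LM
+
+
+class MyCustomLM(LM):
+    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, int]]:
+        results = []
+        for request in requests:
+            context, continuation = request.args
+            # `score_target` is a hypothetical helper: it should return the
+            # summed logprob of `continuation` given `context`, and a 0/1 flag
+            # for whether greedy decoding from `context` would reproduce
+            # `continuation` exactly.
+            ll, is_greedy = score_target(context, continuation)
+            results.append((ll, is_greedy))
+        return results
+```
+
+`generate_until` and `loglikelihood_rolling` follow the same requests-in, results-out pattern, differing only in the contents of `Instance.args` and in the per-request return type described above.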
+
+**Tip: be careful of indexing in loglikelihood!**
+
+
+LMs take in tokens in position `[0 1 2 ... N]` and output a probability distribution for token position `N+1`. We provide a simplified graphic here, excerpted from `huggingface.py`:
+
+```
+# how this all works (illustrated on a causal decoder-only setup):
+#          CTX      CONT
+# inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+# model  \               \
+# logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+# cont_toks    4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
+```
+
+The final token of the target is not passed into the LM, because we want the LM's predictions *up to but not past* that final target token. For more information, check out https://github.com/EleutherAI/lm-evaluation-harness/issues/942 .
+
 ## Registration

 Congrats on implementing your model! Now it's time to test it out.
@@ -81,7 +104,9 @@ class MyCustomLM(LM):

 Using this decorator results in the class being added to an accounting of the usable LM types maintained internally to the library at `lm_eval.api.registry.MODEL_REGISTRY`. See `lm_eval.api.registry` for more detail on what sorts of registries and decorators exist in the library!

+## Testing
+We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py .

 ## Other

diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md
index e0ccc6a3..d0b2b429 100644
--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -17,7 +17,7 @@ git checkout -b
 pip install -e ".[dev]"
 ```
-As a concrete example, we'll walk through reimplementing the `gsm8k` benchmark (a *generative* task which requires sampling text from a model) and the `sciq` benchmark. (a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices).
+In this document, we'll walk through the basics of implementing a static benchmark evaluation in two formats: a *generative* task which requires sampling text from a model, such as [`gsm8k`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/gsm8k/gsm8k.yaml), and a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices, such as [`sciq`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/sciq/sciq.yaml).

 ## Creating a YAML file

@@ -116,7 +116,7 @@ doc_to_choice: ['No', 'Yes']

 We support the [Jinja 2](https://jinja.palletsprojects.com/en/3.1.x/) templating language for writing prompts. In practice, this means you can take your dataset's columns and do many basic string manipulations to place each document into prompted format.

-Take for example `super_glue/boolq`, as input, we'd like to use the features `passage` and `question` and string them together so that for a a sample line `doc`, the model sees something the format of:
+Take for example the dataset `super_glue/boolq`. As input, we'd like to use the features `passage` and `question` and string them together so that for a sample line `doc`, the model sees something in the format of:
 ```
 doc["passage"]
 Question: doc["question"]?
@@ -285,7 +285,7 @@ It's now time to check models' performance on your task!
In the evaluation harne To enable this, we provide a checklist that should be completed when contributing a new task, to enable accurate book-keeping and to ensure that tasks added to the library are well-tested and, where applicable, precedented. -### Task impl. checklist +### Task Validity Checklist The checklist is the following: -- GitLab From 09a744b9ea1823a62107df87cd53e048f9a39c00 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Wed, 1 Nov 2023 16:14:09 -0400 Subject: [PATCH 148/212] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cbd8777b..2cea64c1 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ You can also ask for help, or discuss new features with the maintainers in the # ## Cite as ``` -@software{eval-harness, +@misc{eval-harness, author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and -- GitLab From 7aad26fe2f3201eccc0a24cb5b0a05cd40d8fcc4 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Wed, 1 Nov 2023 16:16:11 -0400 Subject: [PATCH 149/212] Update README.md --- README.md | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/README.md b/README.md index 2cea64c1..2622a62d 100644 --- a/README.md +++ b/README.md @@ -250,30 +250,7 @@ You can also ask for help, or discuss new features with the maintainers in the # ``` @misc{eval-harness, - author = {Gao, Leo and - Tow, Jonathan and - Abbasi, Baber and - Biderman, Stella and - Black, Sid and - DiPofi, Anthony and - Foster, Charles and - Golding, Laurence and - Hsu, Jeffrey and - Le Noac'h, Alain and - Li, Haonan and - McDonell, Kyle and - Muennighoff, Niklas and - Ociepa, Chris - Phang, Jason and - Reynolds, Laria and - Schoelkopf, Hailey and - Skowron, Aviya and - Sutawika, Lintang and - Tang, Eric and - Thite, Anish and - Wang, Ben and - Wang, Kevin and - Zou, Andy}, + author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, title = {A framework for few-shot language model evaluation}, month = sep, year = 2021, -- GitLab From 2efac405e4db9271671b8ad9e78f5e29047fab86 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Wed, 1 Nov 2023 16:19:34 -0400 Subject: [PATCH 150/212] Update README.md --- README.md | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/README.md b/README.md index 2622a62d..6ac931e0 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,12 @@ # Language Model Evaluation Harness -## Notice to Users -(as of 6/15/23) -We have a revamp of the Evaluation Harness library internals staged on the [big-refactor](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) branch! It is far along in progress, but before we start to move the `master` branch of the repository over to this new design with a new version release, we'd like to ensure that it's been tested by outside users and there are no glaring bugs. - -We’d like your help to test it out! you can help by: -1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive, -2. 
Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
-
-If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
-- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks ...` which will run the task in the `master` branch, and what the score is
-- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
-
-Lastly, we'll no longer be accepting new feature requests beyond those that are already open to the master branch as we carry out this switch to the new version over the next week, though we will be accepting bugfixes to `master` branch and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EAI discord for more information.
-
 ## Overview

 This project provides a unified framework to test generative language models on a large number of different evaluation tasks.

 Features:
-- Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
+- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
 - Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
 - Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
 - Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
@@ -58,7 +44,6 @@ To install the package with all extras, run
 pip install -e ".[all]"
 ```
-
 ## Support

 The best way to get support is to open an issue on this repo or join the [EleutherAI discord server](discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases.
--
GitLab


From 70b09fd9a148a05c865e429adc58d20a441837c8 Mon Sep 17 00:00:00 2001
From: Stella Biderman
Date: Wed, 1 Nov 2023 16:20:34 -0400
Subject: [PATCH 151/212] Update README.md

---
 README.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 6ac931e0..6b8ae98d 100644
--- a/README.md
+++ b/README.md
@@ -216,12 +216,6 @@ python -m lm_eval \

 We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.

-## Implementing new tasks
-
-To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).
- - -As a start, we currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, we support prompts authored in the [Promptsource Library](https://github.com/bigscience-workshop/promptsource/tree/main) as described further in [the task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md) and [the advanced task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/advanced_task_guide.md) and welcome contributions of novel task templates and task variants. ## How to Contribute or Learn More? @@ -230,6 +224,13 @@ For more information on the library and how everything fits together, check out You can also ask for help, or discuss new features with the maintainers in the #lm-thunderdome channel of the EleutherAI discord! If you've used the library and have had a positive (or negative) experience, we'd love to hear from you! +### Implementing new tasks + +To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md). + + +As a start, we currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, we support prompts authored in the [Promptsource Library](https://github.com/bigscience-workshop/promptsource/tree/main) as described further in [the task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md) and [the advanced task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/advanced_task_guide.md) and welcome contributions of novel task templates and task variants. + ## Cite as -- GitLab From 06faed0cde9c9e2c48ab1531badbf722c8404c22 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Wed, 1 Nov 2023 16:21:40 -0400 Subject: [PATCH 152/212] Update CODEOWNERS --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 9e637546..35ca63fe 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @haileyschoelkopf @lintangsutawika +* @haileyschoelkopf @lintangsutawika @StellaAthena -- GitLab From d50a2ad61d7d7da0694585ec50b9214fcd8e670e Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Wed, 1 Nov 2023 17:11:50 -0400 Subject: [PATCH 153/212] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6b8ae98d..92af820c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's pop ## Install -To install the `lm-eval` refactor branch from the github repository, run: +To install the `lm-eval` package from the github repository, run: ```bash git clone https://github.com/EleutherAI/lm-evaluation-harness @@ -141,7 +141,7 @@ A full accounting of the supported and planned libraries + APIs can be seen belo | API or Inference Server | Implemented? 
| `--model ` name | Models supported: | Request Types: | |-----------------------------|---------------------------------|----------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------| | OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| OpenAI ChatCompletions | :x: Not yet - needs help! | N/A | (link here?) | `generate_until` (no logprobs) | +| OpenAI ChatCompletions | :x: Not yet - needs testing! | N/A | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) | | Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | | GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Textsynth | Needs testing | `textsynth` | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -- GitLab From 56a2c8e3b72fbaa298bf46c8f17f1cdefb2a6c9b Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Wed, 1 Nov 2023 17:19:56 -0400 Subject: [PATCH 154/212] describe local dataset usage in docs --- docs/new_task_guide.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index d0b2b429..6b30bfc9 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -45,6 +45,16 @@ dataset_name: ... # the dataset configuration to use. Leave `null` if your datas dataset_kwargs: null # any extra keyword arguments that should be passed to the dataset constructor, e.g. `data_dir`. ``` +------------------------------ +**Tip:** To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files: +``` +dataset_path: json +dataset_name: null +dataset_kwargs: + data_files: /path/to/my/json +``` +------------------------------- + Next, we'd like to tell our task what the dataset's train, validation, and test splits are named, if they exist: ```yaml -- GitLab From 4ac7f064d186bb8dfc72350a1a6973f20b0ed2e5 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Wed, 1 Nov 2023 19:00:56 -0400 Subject: [PATCH 155/212] remove `--huggingface_login` --- lm_eval/__main__.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index f57d712c..aaf98419 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -20,7 +20,6 @@ def _handle_non_serializable(o): elif isinstance(o, set): return list(o) else: - print(f"Object of type {o.__class__.__name__} is not JSON serializable,just stringify it") return str(o) @@ -112,12 +111,6 @@ def parse_eval_args() -> argparse.Namespace: default="INFO", help="Log error when tasks are not registered.", ) - parser.add_argument( - "--huggingface_login", - action="store_true", - default=False, - help="huggingface token for downloading some authorization datasets, like toxigen, you need add HUGGINGFACE_LOGIN_TOKEN to environment variable firstly. 
https://huggingface.co/settings/tokens", - ) return parser.parse_args() @@ -134,14 +127,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." ) - if args.huggingface_login: - from huggingface_hub import login - - assert ( - "HUGGINGFACE_LOGIN_TOKEN" in os.environ - ), "Your environment variable does not contain a HUGGINGFACE_LOGIN_TOKEN. Please set the token first." - huggingface_token = os.environ["HUGGINGFACE_LOGIN_TOKEN"] - login(token=huggingface_token) if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") include_path(args.include_path) -- GitLab From 29ba8cb16e272084a34bce21a724ccfc2c100e8e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 06:03:14 +0000 Subject: [PATCH 156/212] fixing display name --- lm_eval/evaluator.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 81f4efd7..4f3544c8 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -230,8 +230,12 @@ def evaluate( task_hierarchy[group_name].append(task_name) versions[group_name] = "N/A" - if "group_alias" in configs[task_name]: + if ("group_alias" in configs[task_name]) and ( + group_name not in task_group_alias + ): + print(group_name) task_group_alias[group_name] = configs[task_name]["group_alias"] + print(task_group_alias) else: task_hierarchy[task_name] = [] @@ -537,11 +541,11 @@ def evaluate( for group_name, task_list in task_hierarchy.items(): order = task_order[group_name] - results_agg[group_name] = results[group_name] + results_agg[group_name] = results[group_name].copy() results_agg[group_name]["tab"] = order if (order < max(task_order.values())) and (len(task_list) > 0): - groups_agg[group_name] = results[group_name] + groups_agg[group_name] = results[group_name].copy() groups_agg[group_name]["tab"] = order if task_list != []: @@ -564,36 +568,41 @@ def evaluate( task_hierarchy, task_order, versions, task_group_alias ) + print("task_group_alias") + print(task_group_alias) + _results_agg = collections.defaultdict(dict) _versions = collections.defaultdict(dict) for task in results_agg: task_results = results_agg[task] + tab_string = "" if "tab" in task_results: tab = task_results.pop("tab") - tab_string = " "*tab+"-" if tab > 0 else "" + tab_string = " " * tab + "-" if tab > 0 else "" if task in task_group_alias: task_alias = task_group_alias[task] - _results_agg[tab_string+task_alias] = task_results - _versions[tab_string+task_alias] = versions[task] + _results_agg[tab_string + task_alias] = task_results + _versions[tab_string + task_alias] = versions[task] else: - _results_agg[tab_string+task] = task_results - _versions[tab_string+task] = versions[task] + _results_agg[tab_string + task] = task_results + _versions[tab_string + task] = versions[task] results_agg = _results_agg versions = _versions _groups_agg = collections.defaultdict(dict) for group in groups_agg: group_results = groups_agg[group] + tab_string = "" if "tab" in group_results: tab = group_results.pop("tab") - tab_string = " "*tab+"-" if tab > 0 else "" + tab_string = " " * tab + "-" if tab > 0 else "" if group in task_group_alias: group_alias = task_group_alias[group] - _groups_agg[tab_string+group_alias] = group_results + _groups_agg[tab_string + group_alias] = group_results else: - _groups_agg[tab_string+group] = group_results + _groups_agg[tab_string + group] = 
group_results groups_agg = _groups_agg results_dict = { -- GitLab From d1e7a30a13a709148f78df214b33a41c89a2f9db Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 06:42:42 +0000 Subject: [PATCH 157/212] fixed stderr calculation --- lm_eval/evaluator.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 4f3544c8..3322297d 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -230,13 +230,6 @@ def evaluate( task_hierarchy[group_name].append(task_name) versions[group_name] = "N/A" - if ("group_alias" in configs[task_name]) and ( - group_name not in task_group_alias - ): - print(group_name) - task_group_alias[group_name] = configs[task_name]["group_alias"] - print(task_group_alias) - else: task_hierarchy[task_name] = [] @@ -249,6 +242,11 @@ def evaluate( if "task_alias" in configs[task_name]: task_group_alias[task_name] = configs[task_name]["task_alias"] + if ("group_alias" in configs[task_name]) and ( + group_name not in task_group_alias + ): + task_group_alias[group_name] = configs[task_name]["group_alias"] + if limit is not None: if task.has_test_docs(): task_docs = task.test_docs() @@ -502,6 +500,7 @@ def evaluate( stderr = "_stderr,".join(metric.split(",")) stderr_score = results[task][stderr] + var_score = stderr_score**2 metric_score = results[task][metric] all_stderr.append(stderr) @@ -514,7 +513,7 @@ def evaluate( # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ results[group][stderr] = ( (total_size - 1) * results[group][stderr] - + (current_size - 1) * stderr_score + + (current_size - 1) * var_score ) / ( total_size + current_size - 1 ) + total_size * current_size / ( @@ -525,7 +524,7 @@ def evaluate( ) ** 2 else: results[group][metric] = metric_score - results[group][stderr] = stderr_score + results[group][stderr] = var_score total_size += current_size -- GitLab From 6dfe848e23c596cc968e254180a4ebb06fe26342 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 06:42:53 +0000 Subject: [PATCH 158/212] added group and task alias --- lm_eval/tasks/mmlu/_generate_configs.py | 2 ++ lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml | 5 ++++- .../tasks/mmlu/default/mmlu_college_computer_science.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml | 5 ++++- .../mmlu/default/mmlu_high_school_computer_science.yaml | 5 ++++- 
.../mmlu/default/mmlu_high_school_european_history.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml | 5 ++++- .../default/mmlu_high_school_government_and_politics.yaml | 5 ++++- .../tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml | 5 ++++- .../tasks/mmlu/default/mmlu_high_school_microeconomics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml | 5 ++++- .../tasks/mmlu/default/mmlu_high_school_world_history.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_international_law.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_management.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_marketing.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_sociology.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_virology.yaml | 5 ++++- lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml | 5 ++++- .../loglikelihood/_mmlu_flan_loglikelihood_template_yaml | 2 +- 59 files changed, 231 insertions(+), 58 deletions(-) diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index 1ea16ece..2bf27ac0 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -112,9 +112,11 @@ if __name__ == "__main__": "group": f"mmlu_{args.task_prefix}_{category}" if args.task_prefix != "" else f"mmlu_{category}", + "group_alias": category.replace("_", " "), "task": f"mmlu_{args.task_prefix}_{subject}" if args.task_prefix != "" else f"mmlu_{subject}", + "task_alias": subject.replace("_", " "), "dataset_name": subject, "description": description, } diff --git a/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml index bb786cc8..90f3cc50 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml @@ -1,5 +1,8 @@ "dataset_name": "abstract_algebra" -"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" +"description": "The following are multiple choice questions (with answers) about 
abstract\ + \ algebra.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_abstract_algebra" +"task_alias": "abstract_algebra" diff --git a/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml index 22eaa7fd..0e9e09b2 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml @@ -1,5 +1,8 @@ "dataset_name": "anatomy" -"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_anatomy" +"task_alias": "anatomy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml index 64f20d20..e3bdfc95 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml @@ -1,5 +1,8 @@ "dataset_name": "astronomy" -"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\ + \n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_astronomy" +"task_alias": "astronomy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml index 49330917..ea0d1fe2 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml @@ -1,5 +1,8 @@ "dataset_name": "business_ethics" -"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_business_ethics" +"task_alias": "business_ethics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml index 547aeccf..20bab147 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml @@ -1,5 +1,8 @@ "dataset_name": "clinical_knowledge" -"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_clinical_knowledge" +"task_alias": "clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml index 69826397..afb4d9c6 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml @@ -1,5 +1,8 @@ "dataset_name": "college_biology" -"description": "The following are multiple choice questions (with answers) about college biology.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_biology" +"task_alias": "college_biology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml 
b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml index b91c07f6..a7de3532 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml @@ -1,5 +1,8 @@ "dataset_name": "college_chemistry" -"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_chemistry" +"task_alias": "college_chemistry" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml index a89a46aa..9786cc6e 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml @@ -1,5 +1,8 @@ "dataset_name": "college_computer_science" -"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_computer_science" +"task_alias": "college_computer_science" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml index c452ff97..e7699f8b 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml @@ -1,5 +1,8 @@ "dataset_name": "college_mathematics" -"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_mathematics" +"task_alias": "college_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml index d696a40d..df9e8901 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml @@ -1,5 +1,8 @@ "dataset_name": "college_medicine" -"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_college_medicine" +"task_alias": "college_medicine" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml index 16046e53..3c5e7462 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml @@ -1,5 +1,8 @@ "dataset_name": "college_physics" -"description": "The following are multiple choice questions (with answers) about college physics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_physics" +"task_alias": "college_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml 
b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml index 923967ae..df9c4a51 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml @@ -1,5 +1,8 @@ "dataset_name": "computer_security" -"description": "The following are multiple choice questions (with answers) about computer security.\n\n" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_computer_security" +"task_alias": "computer_security" diff --git a/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml index 88096f3b..8ab59ed1 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml @@ -1,5 +1,8 @@ "dataset_name": "conceptual_physics" -"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_conceptual_physics" +"task_alias": "conceptual_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml index 4c43a5c8..a974fc84 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml @@ -1,5 +1,8 @@ "dataset_name": "econometrics" -"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_econometrics" +"task_alias": "econometrics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml index 27ab4828..9c45cc61 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml @@ -1,5 +1,8 @@ "dataset_name": "electrical_engineering" -"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_electrical_engineering" +"task_alias": "electrical_engineering" diff --git a/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml index bd7106e4..2154ab65 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml @@ -1,5 +1,8 @@ "dataset_name": "elementary_mathematics" -"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_elementary_mathematics" +"task_alias": "elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml 
b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml index 98486ebe..689d3d1f 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml @@ -1,5 +1,8 @@ "dataset_name": "formal_logic" -"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_formal_logic" +"task_alias": "formal_logic" diff --git a/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml index 9db3f3d3..60b5c129 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml @@ -1,5 +1,8 @@ "dataset_name": "global_facts" -"description": "The following are multiple choice questions (with answers) about global facts.\n\n" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_global_facts" +"task_alias": "global_facts" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml index 0ed8c0e7..c7e055dc 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_biology" -"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_biology" +"task_alias": "high_school_biology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml index 7aa6037d..8e9421c1 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_chemistry" -"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_chemistry" +"task_alias": "high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml index 9cf212af..87ec15cc 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_computer_science" -"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_computer_science" +"task_alias": "high_school_computer_science" diff --git 
a/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml index e9189bd9..be0d696a 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_european_history" -"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_high_school_european_history" +"task_alias": "high_school_european_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml index 7573c8c2..57c5261a 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_geography" -"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_geography" +"task_alias": "high_school_geography" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml index 83d7d498..2e92f152 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_government_and_politics" -"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_government_and_politics" +"task_alias": "high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml index 4e8269b3..988d132a 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_macroeconomics" -"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_macroeconomics" +"task_alias": "high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml index 2b9d3216..f7c07a60 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml @@ -1,5 +1,8 @@ 
"dataset_name": "high_school_mathematics" -"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_mathematics" +"task_alias": "high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml index 3206bfdf..5339a023 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_microeconomics" -"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_microeconomics" +"task_alias": "high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml index 27a1e51a..0fae0405 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_physics" -"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_physics" +"task_alias": "high_school_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml index 1e0b1628..31ecb18e 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_psychology" -"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_psychology" +"task_alias": "high_school_psychology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml index 4244ea8f..54d70880 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_statistics" -"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_statistics" +"task_alias": "high_school_statistics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml 
b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml index a6f085ec..e4432fe4 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_us_history" -"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_high_school_us_history" +"task_alias": "high_school_us_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml index c23d93c8..08773a20 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml @@ -1,5 +1,8 @@ "dataset_name": "high_school_world_history" -"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_high_school_world_history" +"task_alias": "high_school_world_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml index 1478d3dd..c9e1feb1 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml @@ -1,5 +1,8 @@ "dataset_name": "human_aging" -"description": "The following are multiple choice questions (with answers) about human aging.\n\n" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_human_aging" +"task_alias": "human_aging" diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml index ab56a035..715859a1 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml @@ -1,5 +1,8 @@ "dataset_name": "human_sexuality" -"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_human_sexuality" +"task_alias": "human_sexuality" diff --git a/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml index 7a352701..68765225 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml @@ -1,5 +1,8 @@ "dataset_name": "international_law" -"description": "The following are multiple choice questions (with answers) about international law.\n\n" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_international_law" +"task_alias": "international_law" diff --git 
a/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml index af29fae3..e16de5c4 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml @@ -1,5 +1,8 @@ "dataset_name": "jurisprudence" -"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_jurisprudence" +"task_alias": "jurisprudence" diff --git a/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml index 570fd3dd..8b12057b 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml @@ -1,5 +1,8 @@ "dataset_name": "logical_fallacies" -"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_logical_fallacies" +"task_alias": "logical_fallacies" diff --git a/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml index 11166e2f..2387d680 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml @@ -1,5 +1,8 @@ "dataset_name": "machine_learning" -"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" "group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_machine_learning" +"task_alias": "machine_learning" diff --git a/lm_eval/tasks/mmlu/default/mmlu_management.yaml b/lm_eval/tasks/mmlu/default/mmlu_management.yaml index 745ac762..d0cdc812 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_management.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_management.yaml @@ -1,5 +1,8 @@ "dataset_name": "management" -"description": "The following are multiple choice questions (with answers) about management.\n\n" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_management" +"task_alias": "management" diff --git a/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml index 38401dc8..a614db29 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml @@ -1,5 +1,8 @@ "dataset_name": "marketing" -"description": "The following are multiple choice questions (with answers) about marketing.\n\n" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_marketing" +"task_alias": "marketing" diff --git a/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml index 2e4fbbd8..5d7ce708 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml +++ 
b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml @@ -1,5 +1,8 @@ "dataset_name": "medical_genetics" -"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_medical_genetics" +"task_alias": "medical_genetics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml index aa674180..77e819cf 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml @@ -1,5 +1,8 @@ "dataset_name": "miscellaneous" -"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_miscellaneous" +"task_alias": "miscellaneous" diff --git a/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml index ac8bbdb9..2df1a1dd 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml @@ -1,5 +1,8 @@ "dataset_name": "moral_disputes" -"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_moral_disputes" +"task_alias": "moral_disputes" diff --git a/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml index 33a249c2..6da63cb2 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml @@ -1,5 +1,8 @@ "dataset_name": "moral_scenarios" -"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_moral_scenarios" +"task_alias": "moral_scenarios" diff --git a/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml index 44b799cd..df70fbb2 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml @@ -1,5 +1,8 @@ "dataset_name": "nutrition" -"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_nutrition" +"task_alias": "nutrition" diff --git a/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml index 5a703cc4..9dba09c1 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml @@ -1,5 +1,8 @@ "dataset_name": "philosophy" -"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" +"description": "The following 
are multiple choice questions (with answers) about philosophy.\n\ + \n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_philosophy" +"task_alias": "philosophy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml index dc8e65b8..d787898c 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml @@ -1,5 +1,8 @@ "dataset_name": "prehistory" -"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_prehistory" +"task_alias": "prehistory" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml index c59ccffd..3443c336 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml @@ -1,5 +1,8 @@ "dataset_name": "professional_accounting" -"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_professional_accounting" +"task_alias": "professional_accounting" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml index 46a3ebbd..f3a02631 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml @@ -1,5 +1,8 @@ "dataset_name": "professional_law" -"description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ law.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_professional_law" +"task_alias": "professional_law" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml index fe52278d..e8c49b5e 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml @@ -1,5 +1,8 @@ "dataset_name": "professional_medicine" -"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" "group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_professional_medicine" +"task_alias": "professional_medicine" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml index ff7bb1f7..ec48a06f 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml @@ -1,5 +1,8 @@ "dataset_name": "professional_psychology" -"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" +"description": "The following are 
multiple choice questions (with answers) about professional\ + \ psychology.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_professional_psychology" +"task_alias": "professional_psychology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml index 290a85f5..db36fb49 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml @@ -1,5 +1,8 @@ "dataset_name": "public_relations" -"description": "The following are multiple choice questions (with answers) about public relations.\n\n" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_public_relations" +"task_alias": "public_relations" diff --git a/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml index d1a41871..072dfd70 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml @@ -1,5 +1,8 @@ "dataset_name": "security_studies" -"description": "The following are multiple choice questions (with answers) about security studies.\n\n" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_security_studies" +"task_alias": "security_studies" diff --git a/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml index be1e46f5..efcbd27b 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml @@ -1,5 +1,8 @@ "dataset_name": "sociology" -"description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_sociology" +"task_alias": "sociology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml index f94e8bc0..d80ee94a 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml @@ -1,5 +1,8 @@ "dataset_name": "us_foreign_policy" -"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\n" "group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_us_foreign_policy" +"task_alias": "us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_virology.yaml b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml index 4fdc1bf6..d935f92a 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_virology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml @@ -1,5 +1,8 @@ "dataset_name": "virology" -"description": "The following are multiple choice questions (with answers) about virology.\n\n" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \n" "group": "mmlu_other" 
+"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_virology" +"task_alias": "virology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml index 870ea78d..8681354f 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml @@ -1,5 +1,8 @@ "dataset_name": "world_religions" -"description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\n" "group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_world_religions" +"task_alias": "world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml index 2d5d92ef..5db2981a 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml @@ -12,4 +12,4 @@ metric_list: higher_is_better: true - metric: acc_norm aggregation: mean - higher_is_better: true \ No newline at end of file + higher_is_better: true -- GitLab From 6eac00565e3fdc2741c1048960f2e16dd308222d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 06:44:32 +0000 Subject: [PATCH 159/212] removed print --- lm_eval/evaluator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 3322297d..76b2c475 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -567,9 +567,6 @@ def evaluate( task_hierarchy, task_order, versions, task_group_alias ) - print("task_group_alias") - print(task_group_alias) - _results_agg = collections.defaultdict(dict) _versions = collections.defaultdict(dict) for task in results_agg: -- GitLab From 0d1fb8a5e805bc7e3987ce301e1b835d1c551863 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Nov 2023 15:34:16 +0800 Subject: [PATCH 160/212] Rename greedy_until_template_yaml to generate_until_template_yaml --- .../{greedy_until_template_yaml => generate_until_template_yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lm_eval/tasks/bigbench/{greedy_until_template_yaml => generate_until_template_yaml} (100%) diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/generate_until_template_yaml similarity index 100% rename from lm_eval/tasks/bigbench/greedy_until_template_yaml rename to lm_eval/tasks/bigbench/generate_until_template_yaml -- GitLab From ca8b00ccee1cc0b891fc4af6e8b801e6c82d0e91 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 09:06:51 +0000 Subject: [PATCH 161/212] very rough way of utilzing verbosity --- lm_eval/__init__.py | 3 ++- lm_eval/__main__.py | 34 +++++++++++++++++++++++----------- lm_eval/evaluator.py | 5 +---- lm_eval/tasks/__init__.py | 9 +++++++-- 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/lm_eval/__init__.py b/lm_eval/__init__.py index 317c0291..b5fe21ec 100644 --- a/lm_eval/__init__.py +++ b/lm_eval/__init__.py @@ -1 +1,2 @@ -from .evaluator import evaluate, simple_evaluate +# from .evaluator import evaluate, simple_evaluate +# from .logger import eval_logger, SPACING \ No newline at end of file diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index aaf98419..eb618b4e 100644 
--- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -1,18 +1,24 @@ import os import re import json -import fnmatch -import argparse import logging -from pathlib import Path +import argparse import numpy as np -from lm_eval import evaluator, utils -from lm_eval.api.registry import ALL_TASKS -from lm_eval.logger import eval_logger, SPACING -from lm_eval.tasks import include_path +from pathlib import Path from typing import Union +import logging + +logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=logging.INFO, +) +eval_logger = logging.getLogger("lm-eval") + +SPACING = " " * 47 + def _handle_non_serializable(o): if isinstance(o, np.int64) or isinstance(o, np.int32): @@ -29,7 +35,7 @@ def parse_eval_args() -> argparse.Namespace: parser.add_argument( "--tasks", default=None, - help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))), + # help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))), ) parser.add_argument( "--model_args", @@ -115,13 +121,19 @@ def parse_eval_args() -> argparse.Namespace: def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: - if not args: - # we allow for args to be passed externally, else we parse them ourselves - args = parse_eval_args() + # if not args: + # # we allow for args to be passed externally, else we parse them ourselves + # from lm_eval.logger import eval_logger, SPACING + + args = parse_eval_args() eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) os.environ["TOKENIZERS_PARALLELISM"] = "false" + from lm_eval import evaluator, utils + from lm_eval.api.registry import ALL_TASKS + from lm_eval.tasks import include_path + if args.limit: eval_logger.warning( " --limit SHOULD ONLY BE USED FOR TESTING." diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index bf35097c..750aa426 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -25,10 +25,6 @@ from lm_eval.utils import ( from lm_eval.logger import eval_logger -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stdout)) - @positional_deprecated def simple_evaluate( @@ -46,6 +42,7 @@ def simple_evaluate( decontamination_ngrams_path=None, write_out: bool = False, log_samples: bool = True, + verbosity: str = "INFO", ): """Instantiate and evaluate a model on a list of tasks. diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index d36b6077..c4104418 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -16,7 +16,9 @@ from lm_eval.api.registry import ( import logging eval_logger = logging.getLogger("lm-eval") - +# from lm_eval.logger import eval_logger +# print("tasks.py eval_logger.level") +print(eval_logger.level) def register_configurable_task(config: Dict[str, str]) -> int: SubClass = type( @@ -141,8 +143,11 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: else: if type(config["task"]) == list: register_configurable_group(config, yaml_path) + + # Log this silently and show it only when + # the user defines the appropriate verbosity. except ModuleNotFoundError as e: - eval_logger.warning( + eval_logger.debug( f"{yaml_path}: {e}. Config will not be added to registry." 
) except Exception as error: -- GitLab From 7ec8248520246012a02660eb1b98d135fc1c5d36 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 10:12:25 +0000 Subject: [PATCH 162/212] changed the order of import to allow the verbosity to take effect on other parts of the repo --- lm_eval/__main__.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index eb618b4e..8bcf1d94 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -1,5 +1,6 @@ import os import re +import sys import json import logging import argparse @@ -10,14 +11,13 @@ from typing import Union import logging +SPACING = " " * 47 + logging.basicConfig( format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", datefmt="%Y-%m-%d:%H:%M:%S", level=logging.INFO, ) -eval_logger = logging.getLogger("lm-eval") - -SPACING = " " * 47 def _handle_non_serializable(o): @@ -28,14 +28,13 @@ def _handle_non_serializable(o): else: return str(o) - def parse_eval_args() -> argparse.Namespace: parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("--model", required=True, help="Name of model e.g. `hf`") + parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`") parser.add_argument( "--tasks", default=None, - # help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))), + help="To get full list of tasks, use the command lm-eval --tasks list" ) parser.add_argument( "--model_args", @@ -121,18 +120,18 @@ def parse_eval_args() -> argparse.Namespace: def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: - # if not args: - # # we allow for args to be passed externally, else we parse them ourselves - # from lm_eval.logger import eval_logger, SPACING - - args = parse_eval_args() + if not args: + # we allow for args to be passed externally, else we parse them ourselves + args = parse_eval_args() + eval_logger = logging.getLogger("lm-eval") eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) + eval_logger.info(f"Verbosity set to {args.verbosity}") os.environ["TOKENIZERS_PARALLELISM"] = "false" from lm_eval import evaluator, utils - from lm_eval.api.registry import ALL_TASKS from lm_eval.tasks import include_path + from lm_eval.api.registry import ALL_TASKS if args.limit: eval_logger.warning( @@ -145,6 +144,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: if args.tasks is None: task_names = ALL_TASKS + elif args.tasks == "list": + eval_logger.info("Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS)))) + sys.exit() else: if os.path.isdir(args.tasks): import glob -- GitLab From f701ba7ddc00db78e120cbcd2daf64a445e08259 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 10:12:58 +0000 Subject: [PATCH 163/212] eval_logger is not imported from logger.py anymore --- lm_eval/api/metrics.py | 2 ++ lm_eval/api/model.py | 4 +++- lm_eval/api/registry.py | 4 +++- lm_eval/api/task.py | 3 ++- lm_eval/evaluator.py | 4 ++-- lm_eval/logger.py | 10 ---------- lm_eval/tasks/__init__.py | 3 --- lm_eval/utils.py | 3 ++- 8 files changed, 14 insertions(+), 19 deletions(-) delete mode 100644 lm_eval/logger.py diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 69e66fdc..c560256a 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -9,6 +9,8 @@ import evaluate from lm_eval.api.registry import register_metric, register_aggregation +import logging +eval_logger = 
logging.getLogger("lm-eval") # Register Aggregations First @register_aggregation("mean") diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index c24026ac..4b91ae99 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -10,7 +10,9 @@ import hashlib from tqdm import tqdm from lm_eval import utils -from lm_eval.logger import eval_logger + +import logging +eval_logger = logging.getLogger("lm-eval") T = TypeVar("T", bound="LM") diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 4e78048b..4212a4ad 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -1,7 +1,9 @@ import os import evaluate from lm_eval.api.model import LM -from lm_eval.logger import eval_logger + +import logging +eval_logger = logging.getLogger("lm-eval") MODEL_REGISTRY = {} diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 883c643d..c4b0f44a 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -21,7 +21,6 @@ from lm_eval.api import samplers from lm_eval.api.instance import Instance from lm_eval.api.filter import FilterEnsemble -from lm_eval.logger import eval_logger from lm_eval.prompts import get_prompt from lm_eval.filters import build_filter_ensemble from lm_eval.api.metrics import ( @@ -47,6 +46,8 @@ ALL_OUTPUT_TYPES = [ "generate_until", ] +import logging +eval_logger = logging.getLogger("lm-eval") @dataclass class TaskConfig(dict): diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 750aa426..a361131c 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -23,7 +23,7 @@ from lm_eval.utils import ( get_git_commit_hash, ) -from lm_eval.logger import eval_logger +eval_logger = logging.getLogger("lm-eval") @positional_deprecated @@ -246,7 +246,7 @@ def evaluate( task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size) - eval_logger.info( + eval_logger.debug( f"Task: {task_name}; number of requests on this rank: {len(task.instances)}" ) diff --git a/lm_eval/logger.py b/lm_eval/logger.py deleted file mode 100644 index 129c112e..00000000 --- a/lm_eval/logger.py +++ /dev/null @@ -1,10 +0,0 @@ -import logging - -logging.basicConfig( - format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", - datefmt="%Y-%m-%d:%H:%M:%S", - level=logging.INFO, -) -eval_logger = logging.getLogger("lm-eval") - -SPACING = " " * 47 diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index c4104418..5d5678da 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -16,9 +16,6 @@ from lm_eval.api.registry import ( import logging eval_logger = logging.getLogger("lm-eval") -# from lm_eval.logger import eval_logger -# print("tasks.py eval_logger.level") -print(eval_logger.level) def register_configurable_task(config: Dict[str, str]) -> int: SubClass = type( diff --git a/lm_eval/utils.py b/lm_eval/utils.py index d246470a..c5dcb5d3 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -19,7 +19,8 @@ import transformers from jinja2 import BaseLoader, Environment, StrictUndefined from itertools import islice -from lm_eval.logger import eval_logger +import logging +eval_logger = logging.getLogger("lm-eval") def escaped_split(text, sep_char, maxsplit=-1): -- GitLab From 73f3029c1eaaaf67481b8650c5edf12baab3a0d8 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 2 Nov 2023 10:59:23 +0000 Subject: [PATCH 164/212] precommit format --- docs/new_task_guide.md | 2 +- lm_eval/__init__.py | 2 +- lm_eval/__main__.py | 15 +++++++-------- lm_eval/api/metrics.py | 1 + lm_eval/api/model.py | 1 + 
lm_eval/api/registry.py | 1 + lm_eval/api/task.py | 4 +++- lm_eval/tasks/__init__.py | 3 ++- lm_eval/utils.py | 1 + 9 files changed, 18 insertions(+), 12 deletions(-) diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 6b30bfc9..86966be5 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -50,7 +50,7 @@ dataset_kwargs: null # any extra keyword arguments that should be passed to the ``` dataset_path: json dataset_name: null -dataset_kwargs: +dataset_kwargs: data_files: /path/to/my/json ``` ------------------------------- diff --git a/lm_eval/__init__.py b/lm_eval/__init__.py index b5fe21ec..323f916f 100644 --- a/lm_eval/__init__.py +++ b/lm_eval/__init__.py @@ -1,2 +1,2 @@ # from .evaluator import evaluate, simple_evaluate -# from .logger import eval_logger, SPACING \ No newline at end of file +# from .logger import eval_logger, SPACING diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index 8bcf1d94..1ece1757 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -9,10 +9,6 @@ import numpy as np from pathlib import Path from typing import Union -import logging - -SPACING = " " * 47 - logging.basicConfig( format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", datefmt="%Y-%m-%d:%H:%M:%S", @@ -28,13 +24,14 @@ def _handle_non_serializable(o): else: return str(o) + def parse_eval_args() -> argparse.Namespace: parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`") parser.add_argument( "--tasks", default=None, - help="To get full list of tasks, use the command lm-eval --tasks list" + help="To get full list of tasks, use the command lm-eval --tasks list", ) parser.add_argument( "--model_args", @@ -145,7 +142,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: if args.tasks is None: task_names = ALL_TASKS elif args.tasks == "list": - eval_logger.info("Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS)))) + eval_logger.info( + "Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS))) + ) sys.exit() else: if os.path.isdir(args.tasks): @@ -169,10 +168,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: missing = ", ".join(task_missing) eval_logger.error( f"Tasks were not found: {missing}\n" - f"{SPACING}Try `lm-eval -h` for list of available tasks", + f"{' ' * 47}Try `lm-eval --tasks list` for list of available tasks", ) raise ValueError( - f"Tasks {missing} were not found. Try `lm-eval -h` for list of available tasks." + f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks." 
) if args.output_path: diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index c560256a..4eb68585 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -10,6 +10,7 @@ import evaluate from lm_eval.api.registry import register_metric, register_aggregation import logging + eval_logger = logging.getLogger("lm-eval") # Register Aggregations First diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 4b91ae99..a8aef53e 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -12,6 +12,7 @@ from tqdm import tqdm from lm_eval import utils import logging + eval_logger = logging.getLogger("lm-eval") T = TypeVar("T", bound="LM") diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 4212a4ad..7d73ae6c 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -3,6 +3,7 @@ import evaluate from lm_eval.api.model import LM import logging + eval_logger = logging.getLogger("lm-eval") MODEL_REGISTRY = {} diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index c4b0f44a..e3bf1cae 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -4,6 +4,7 @@ from dataclasses import dataclass, field, asdict import re import ast import yaml +import logging import evaluate import random import itertools @@ -46,9 +47,10 @@ ALL_OUTPUT_TYPES = [ "generate_until", ] -import logging + eval_logger = logging.getLogger("lm-eval") + @dataclass class TaskConfig(dict): # task naming/registry diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 5d5678da..33727058 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -17,6 +17,7 @@ import logging eval_logger = logging.getLogger("lm-eval") + def register_configurable_task(config: Dict[str, str]) -> int: SubClass = type( config["task"] + "ConfigurableTask", @@ -141,7 +142,7 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: if type(config["task"]) == list: register_configurable_group(config, yaml_path) - # Log this silently and show it only when + # Log this silently and show it only when # the user defines the appropriate verbosity. except ModuleNotFoundError as e: eval_logger.debug( diff --git a/lm_eval/utils.py b/lm_eval/utils.py index c5dcb5d3..add1ed66 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -20,6 +20,7 @@ from jinja2 import BaseLoader, Environment, StrictUndefined from itertools import islice import logging + eval_logger = logging.getLogger("lm-eval") -- GitLab From ad8eee89a917d60032c5484c04cbc32284f6670a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 2 Nov 2023 18:34:26 +0000 Subject: [PATCH 165/212] cleanup hf tqdm --- lm_eval/models/huggingface.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 54cf98a4..8feb8cfa 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -676,9 +676,7 @@ class HFLM(LM): ) pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0))) - for ( - chunk - ) in chunks: # tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))): + for chunk in chunks: inps = [] cont_toks_list = [] inplens = [] @@ -863,7 +861,7 @@ class HFLM(LM): if self.batch_size == "auto" and not adaptive_batch_size else None, ) - for chunk in tqdm(chunks, disable=self.rank != 0): + for chunk in chunks: contexts, all_gen_kwargs = zip(*chunk) # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. 
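Note on the verbosity rework in PATCH 161-164 above: it leans on Python's stdlib logging hierarchy, where logging.getLogger(name) returns the same logger object for a given name, so the level set once at the CLI entry point governs every module that fetched "lm-eval". A minimal, self-contained sketch of that pattern follows; it is not part of any patch in this series, and the verbosity string and log message are illustrative stand-ins.

import logging

# Root handler and format, configured once at the entry point
# (mirrors the basicConfig call the patches place in lm_eval/__main__.py).
logging.basicConfig(
    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%d:%H:%M:%S",
    level=logging.INFO,
)

# Every module runs this same two-liner; getLogger returns the one shared
# "lm-eval" logger object rather than a new instance per module.
eval_logger = logging.getLogger("lm-eval")

# The entry point maps the CLI verbosity string onto that shared logger once.
verbosity = "DEBUG"  # illustrative stand-in for args.verbosity
eval_logger.setLevel(getattr(logging, verbosity))

# Level checks happen on the originating logger, not on ancestors, so this
# DEBUG record reaches the root handler even though basicConfig used INFO.
eval_logger.debug("visible repo-wide once verbosity is DEBUG")

This also explains why deleting lm_eval/logger.py in PATCH 163 is safe: the handler installed by basicConfig has no level of its own, so only the effective level of the named "lm-eval" logger decides what is emitted, wherever the module-level getLogger call happens to live.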
-- GitLab From 37ac5f468d2d67dddc2b68c4ea047a5e8bae20b5 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 3 Nov 2023 16:52:59 +0000 Subject: [PATCH 166/212] remove gold_alias from codebase --- lm_eval/api/task.py | 21 ------------------- lm_eval/tasks/gsm8k/gsm8k-cot.yaml | 5 +++-- lm_eval/tasks/gsm8k/gsm8k.yaml | 4 ++-- .../utilitarianism_original_yaml | 1 - lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py | 9 -------- 5 files changed, 5 insertions(+), 35 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 883c643d..256a4671 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -69,7 +69,6 @@ class TaskConfig(dict): doc_to_text: Union[Callable, str] = None doc_to_target: Union[Callable, str] = None doc_to_choice: Union[Callable, str, dict, list] = None - gold_alias: Union[Callable, str] = None process_results: Union[Callable, str] = None use_prompt: str = None description: str = "" @@ -893,26 +892,6 @@ class ConfigurableTask(Task): else: raise TypeError - def gold_alias(self, doc): - # returns a version of the gold target answer to a document, - # which should be passed into metric for scoring as the ground truth. - - # in multiple_choice tasks, this should be castable to an int corresponding to the index - # within the answer choices, while doc_to_target is the string version of {{answer_choices[gold]}}. - if self.config.gold_alias is not None: - doc_to_target = self.config.gold_alias - else: - return self.doc_to_target(doc) - - if type(doc_to_target) == str: - return utils.apply_template(doc_to_target, doc) - elif callable(doc_to_target): - return doc_to_target(doc) - elif hasattr(doc_to_target, "apply"): - return doc_to_target.apply(doc)[1] - else: - raise TypeError - def construct_requests( self, doc: dict, ctx: str, **kwargs ) -> Union[List[Instance], Instance]: diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml index 92555a57..b318eed3 100644 --- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml @@ -14,17 +14,18 @@ Q: There were nine computers in the server room. Five more computers were instal Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\n\ Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. 
The answer is 8.\n\n\ Q: {{question}}\n\nA:" -doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" -gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against +doc_to_target: " {{answer.split('### ')[-1].rstrip()}}" metric_list: - metric: exact_match aggregation: mean higher_is_better: true ignore_case: true + ignore_whitespace: true ignore_punctuation: false regexes_to_ignore: - "," - "\\$" + - ".*### " generation_kwargs: until: - "Q:" diff --git a/lm_eval/tasks/gsm8k/gsm8k.yaml b/lm_eval/tasks/gsm8k/gsm8k.yaml index 124f708d..45c248ae 100644 --- a/lm_eval/tasks/gsm8k/gsm8k.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k.yaml @@ -1,6 +1,6 @@ group: - math_word_problems -task: gsm8k_yaml +task: gsm8k dataset_path: gsm8k dataset_name: main output_type: generate_until @@ -9,12 +9,12 @@ fewshot_split: train test_split: test doc_to_text: "Question: {{question}}\nAnswer:" doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" -gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against metric_list: - metric: exact_match aggregation: mean higher_is_better: true ignore_case: true + ignore_whitespace: true ignore_punctuation: false regexes_to_ignore: - "," diff --git a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml index a7e712cc..04b433f6 100644 --- a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml +++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml @@ -9,7 +9,6 @@ # template_aliases: #"{% set answer_choices = range(1, 11)|list %}" # doc_to_text: 'Activity: "{{activity}}"\nRating:' # doc_to_target: "{{answer_choices[label]}}" -# gold_alias: "{{label}}" # this will be cast to an int. # metric_list: # - metric: acc # TODO: we want this to be implemented as a winograd_schema task type, actually diff --git a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py index 516f0e2f..51c19870 100644 --- a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py +++ b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py @@ -3,12 +3,3 @@ def doc_to_text(doc) -> str: return "Abstract: {}\nQuestion: {}\nAnswer:".format( ctxs, doc["QUESTION"], doc["final_decision"] ) - - -def doc_to_target(doc) -> str: - return " {}".format(doc["final_decision"]) - - -def gold_alias(doc): - dict_to_label = {"yes": 0, "no": 1, "maybe": 2} - return dict_to_label[doc["final_decision"]] -- GitLab From b9f0d0d38656c5f4260d74585cf9f779d5026960 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 3 Nov 2023 16:55:17 +0000 Subject: [PATCH 167/212] make sure gold_alias not in docs --- docs/task_guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/task_guide.md b/docs/task_guide.md index 5d63c15d..30ae038f 100644 --- a/docs/task_guide.md +++ b/docs/task_guide.md @@ -20,12 +20,12 @@ Task naming + registration: Dataset configuration options: - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub. -- **dataset_name** (`str`, *optional*, defaults to None) — The name of, what HF calls, a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.) 
+- **dataset_name** (`str`, *optional*, defaults to None) — The name of what HF calls a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.) - **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv. - **training_split** (`str`, *optional*) — Split in the dataset to use as the training split. - **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split. - **test_split** (`str`, *optional*) — Split in the dataset to use as the test split. -- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?) +- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. assert that this not None if num_fewshot > 0. - **process_docs** (`Callable`, *optional*) — Optionally define a function to apply to each HF dataset split, to preprocess all documents before being fed into prompt template rendering or other evaluation steps. Can be used to rename dataset columns, or to process documents into a format closer to the expected format expected by a prompt template. Prompting / in-context formatting options: -- GitLab From 4d9b928ef283a9c83c40f0994d1fc4c4aa448811 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 3 Nov 2023 14:03:05 -0400 Subject: [PATCH 168/212] Update _generate_configs.py --- lm_eval/tasks/model_written_evals/persona/_generate_configs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py index 949118f1..a21f2830 100644 --- a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py +++ b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py @@ -4,6 +4,7 @@ import datasets from tqdm import tqdm + def main() -> None: dataset_path = "EleutherAI/persona" -- GitLab From 9f09016db25c09d3e5dff2a0e473f29f6108118e Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 3 Nov 2023 14:03:40 -0400 Subject: [PATCH 169/212] Update ignore.txt --- ignore.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignore.txt b/ignore.txt index c93b98d1..de10b539 100644 --- a/ignore.txt +++ b/ignore.txt @@ -5,4 +5,4 @@ maka mor te ond -extraversion \ No newline at end of file +extraversion -- GitLab From 93d088c8461f7a68d91f4985bdd935188c90e178 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 3 Nov 2023 18:26:20 +0000 Subject: [PATCH 170/212] fix gsm8k regexes --- lm_eval/tasks/gsm8k/gsm8k-cot.yaml | 6 +++--- lm_eval/tasks/gsm8k/gsm8k.yaml | 15 +++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml index b318eed3..71381400 100644 --- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml @@ -20,12 +20,12 @@ metric_list: aggregation: mean higher_is_better: true ignore_case: true - ignore_whitespace: true ignore_punctuation: false regexes_to_ignore: - "," - 
"\\$" - - ".*### " + - "(?s).*#### " + - "\n\n" generation_kwargs: until: - "Q:" @@ -38,5 +38,5 @@ filter_list: - name: "get-answer" filter: - function: "regex" - regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)." - function: "take_first" diff --git a/lm_eval/tasks/gsm8k/gsm8k.yaml b/lm_eval/tasks/gsm8k/gsm8k.yaml index 45c248ae..9cf16158 100644 --- a/lm_eval/tasks/gsm8k/gsm8k.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k.yaml @@ -14,12 +14,11 @@ metric_list: aggregation: mean higher_is_better: true ignore_case: true - ignore_whitespace: true ignore_punctuation: false regexes_to_ignore: - "," - "\\$" - - ".*### " + - "(?s).*#### " generation_kwargs: until: - "\n\n" @@ -28,9 +27,9 @@ generation_kwargs: temperature: 0.0 repeats: 1 num_fewshot: 5 -# filter_list: -# - name: "get-answer" -# filter: -# - function: "regex" -# regex_pattern: "### (\\-?[0-9\\.\\,]+)" -# - function: "take_first" +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" -- GitLab From 5f3b8bf668047ce6433c4f93c26a32499c78d992 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 3 Nov 2023 19:52:51 +0000 Subject: [PATCH 171/212] fix bug with args.verbose --- lm_eval/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index 5ddc1c4f..aaf98419 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -182,7 +182,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: assert args.output_path, "Specify --output_path" eval_logger.info(f"Selected Tasks: {task_names}") - eval_logger.verbose = args.verbose results = evaluator.simple_evaluate( model=args.model, -- GitLab From b2b6a90bfef5cf3177ebb668c13c6f8f4bef9c39 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 3 Nov 2023 20:20:05 +0000 Subject: [PATCH 172/212] upstream GGUF/llama.cpp model to big-refactor --- README.md | 2 +- lm_eval/models/__init__.py | 1 + lm_eval/models/gguf.py | 125 +++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 lm_eval/models/gguf.py diff --git a/README.md b/README.md index 92af820c..8fc81fb6 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ A full accounting of the supported and planned libraries + APIs can be seen belo | GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Textsynth | Needs testing | `textsynth` | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| GGML/[Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | Llama-architecture models (Llama, Llama 2, Llemma, Mistral(?), Llama finetunes) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | vLLM | :x: Not yet - needs help! | N/A | All HF models | `generate_until` (no logprobs) | | Your inference server here! | ... | ... | ... | ... | | ... 
| diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py index fa44aff8..61a0775f 100644 --- a/lm_eval/models/__init__.py +++ b/lm_eval/models/__init__.py @@ -3,6 +3,7 @@ from . import openai_completions from . import textsynth from . import dummy from . import anthropic_llms +from . import gguf # TODO: implement __all__ diff --git a/lm_eval/models/gguf.py b/lm_eval/models/gguf.py new file mode 100644 index 00000000..5ae154f3 --- /dev/null +++ b/lm_eval/models/gguf.py @@ -0,0 +1,125 @@ +import requests +import logging +import time +from tqdm import tqdm +from requests.exceptions import RequestException +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model + +logger = logging.getLogger(__name__) + + +def get_result(logprobs, context_length): + is_greedy = True + offsets = logprobs["text_offset"] + tokens = logprobs["tokens"] + tokens_logprobs = logprobs["token_logprobs"] + + idx = 0 + while offsets[idx] < context_length: + idx += 1 + continuation_logprobs = sum(tokens_logprobs[idx:-1]) + for i in range(idx, len(tokens)): + token = tokens[i] + top_tokens = logprobs["top_logprobs"][i] + top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x]) + if top_token != token: + is_greedy = False + break + + return continuation_logprobs, is_greedy + + +@register_model("gguf", "ggml") +class GGUFLM(LM): + def __init__(self, base_url=None, max_length=2048, **kwargs): + super().__init__() + self.base_url = base_url + assert self.base_url, "must pass `base_url` to use GGUF LM!" + self.logprobs = 10 + self.temperature = 0.0 + self.max_length = max_length + + def gguf_completion( + self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs + ): + for _ in range(retries): + try: + prompt = context + request = { + "prompt": prompt, + "logprobs": self.logprobs, + "temperature": self.temperature, + } + if continuation: + prompt += continuation + request.update({"prompt": prompt, "max_tokens": 1, "echo": True}) + if stop is not None: + request["stop"] = stop + response = requests.post( + f"{self.base_url}/v1/completions", json=request + ) + response.raise_for_status() + return response.json() + except RequestException as e: + logger.error(f"RequestException: {e}") + time.sleep(delay) # wait before retrying + else: + raise Exception(f"Failed to get a valid response after {retries} retries.") + + def loglikelihood(self, requests): + if not requests: + return [] + res = [] + for context, continuation in tqdm([req.args for req in requests]): + response = self.gguf_completion(context=context, continuation=continuation) + if response and "choices" in response and response["choices"]: + choice = response["choices"][0] + logprobs = choice.get("logprobs") + if ( + logprobs + and "token_logprobs" in logprobs + and logprobs["token_logprobs"] + ): + logprob, is_greedy = get_result(logprobs, len(context)) + res.append((logprob, is_greedy)) + else: + logger.warning( + "Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list." + ) + else: + logger.error( + f"Invalid response for loglikelihood. 
Response: {response}" + ) + assert False + return res + + def generate_until(self, requests): + if not requests: + return [] + + res = [] + for request in tqdm([req.args for req in requests]): + inp = request[0] + request_args = request[1] + until = request_args.get("until", [""]) + response = self.gguf_completion(context=inp, stop=until) + if response and "choices" in response and response["choices"]: + choice = response["choices"][0] + if "text" in choice: + generated_text = choice["text"].strip() + res.append(generated_text) + else: + logger.error( + f"Invalid response for greedy_until. Response: {response}" + ) + res.append(None) # Add default value in case of error + else: + logger.error(f"Invalid response for greedy_until. Response: {response}") + res.append(None) # Add default value in case of error + return res + + def loglikelihood_rolling(self, requests): + raise NotImplementedError( + "loglikelihood_rolling not yet supported for GGUF models" + ) -- GitLab From 1e1dbaf3545277a3ce0e890e36f4f528f6f3139d Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 6 Nov 2023 09:50:17 +0000 Subject: [PATCH 173/212] remove samples in the table --- lm_eval/evaluator.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 76b2c475..14ca4f5a 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -571,6 +571,10 @@ def evaluate( _versions = collections.defaultdict(dict) for task in results_agg: task_results = results_agg[task] + + if "samples" in task_results: + task_results.pop("samples") + tab_string = "" if "tab" in task_results: tab = task_results.pop("tab") @@ -589,6 +593,10 @@ def evaluate( _groups_agg = collections.defaultdict(dict) for group in groups_agg: group_results = groups_agg[group] + + if "samples" in group_results: + group_results.pop("samples") + tab_string = "" if "tab" in group_results: tab = group_results.pop("tab") -- GitLab From 491d479930824577922741330197321fc88cdbb9 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 6 Nov 2023 09:56:20 +0000 Subject: [PATCH 174/212] reformat --- docs/new_task_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 6b30bfc9..86966be5 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -50,7 +50,7 @@ dataset_kwargs: null # any extra keyword arguments that should be passed to the ``` dataset_path: json dataset_name: null -dataset_kwargs: +dataset_kwargs: data_files: /path/to/my/json ``` ------------------------------- -- GitLab From 44124d95a25195da0a3d129dddabb37c43ba5ce2 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 6 Nov 2023 13:08:33 +0000 Subject: [PATCH 175/212] add space after - --- lm_eval/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 14ca4f5a..6b951ef2 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -578,7 +578,7 @@ def evaluate( tab_string = "" if "tab" in task_results: tab = task_results.pop("tab") - tab_string = " " * tab + "-" if tab > 0 else "" + tab_string = " " * tab + "- " if tab > 0 else "" if task in task_group_alias: task_alias = task_group_alias[task] @@ -600,7 +600,7 @@ def evaluate( tab_string = "" if "tab" in group_results: tab = group_results.pop("tab") - tab_string = " " * tab + "-" if tab > 0 else "" + tab_string = " " * tab + "- " if tab > 0 else "" if group in task_group_alias: group_alias = task_group_alias[group] -- GitLab From 
8bf55a204d198f56228e572a74d9ef39a9fa8c74 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 7 Nov 2023 06:53:28 +0000 Subject: [PATCH 176/212] add squad from master --- lm_eval/api/task.py | 129 ++++++++++--- lm_eval/evaluator.py | 9 +- lm_eval/tasks/__init__.py | 2 + lm_eval/tasks/squad.py | 211 +++++++++++++++++++++ lm_eval/tasks/squadv2/default.yaml | 2 +- lm_eval/tasks/squadv2/with_noans_prob.yaml | 2 +- 6 files changed, 318 insertions(+), 37 deletions(-) create mode 100644 lm_eval/tasks/squad.py diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 406cff4d..4dc79888 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -91,7 +91,7 @@ class TaskConfig(dict): metadata: str = None # by default, not used in the code. allows for users to pass arbitrary info to tasks def __post_init__(self) -> None: - if "." in self.dataset_path: + if self.dataset_path and ("." in self.dataset_path): import inspect from importlib import import_module @@ -204,19 +204,19 @@ class Task(abc.ABC): self._fewshot_docs = None self._instances = None - self._config = TaskConfig(**config) if config else TaskConfig() + self._config = ( + TaskConfig( + { + **config, + **{"dataset_path": DATASET_PATH, "dataset_name": DATASET_NAME}, + } + ) + if config + else TaskConfig() + ) - if not hasattr(self, "_filters"): - self._filters = [] - for name, components in self._config.get( - "filters", [["none", [["take_first", None]]]] - ): - filter_pipeline = build_filter_ensemble(name, components) - self._filters.append(filter_pipeline) + self._filters = [build_filter_ensemble("none", [["take_first", None]])] - self.sampler = samplers.Sampler( - list(self.fewshot_docs()), self, rnd=random.Random(1234) - ) def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None: """Downloads and returns the task dataset. @@ -358,7 +358,7 @@ class Task(abc.ABC): ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" eval_logger.info( - f"Building contexts for task '{self.config.task}' on rank {rank}..." + f"Building contexts for task on rank {rank}..." ) instances = [] @@ -449,7 +449,9 @@ class Task(abc.ABC): return len(re.split(r"\s+", doc)) @utils.positional_deprecated - def fewshot_context(self, doc, num_fewshot): + def fewshot_context( + self, doc, num_fewshot, provide_description=None, rnd=random.Random(1234), description=None + ): """Returns a fewshot context string that is made up of a prepended description (if provided), the `num_fewshot` number of examples, and an appended prompt example. @@ -457,34 +459,68 @@ class Task(abc.ABC): The document as returned from training_docs, validation_docs, or test_docs. :param num_fewshot: int The number of fewshot examples to provide in the returned context string. + :param provide_description: bool + Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method + :param rnd: random.Random + The pseudo-random number generator used to randomly sample examples. + WARNING: This is currently a required arg although it's optionalized with a default `None`. + :param description: str + The task's description that will be prepended to the fewshot examples. :returns: str The fewshot context. """ + assert ( + rnd is not None + ), "A `random.Random` generator argument must be provided to `rnd`" + assert not provide_description, ( + "The `provide_description` arg will be removed in future versions. 
To prepend " + "a custom description to the context, supply the corresponding string via the " + "`description` arg." + ) + if provide_description is not None: + # nudge people to not specify it at all + print( + "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" + ) + + description = description + "\n\n" if description else "" if num_fewshot == 0: - # always prepend the (possibly empty) task description - labeled_examples = self.config.description + labeled_examples = "" else: - labeled_examples = self.config.description + self.sampler.get_context( - doc, num_fewshot + # for sets with no training docs, draw from other set *but ensure no overlap with current doc* + if self.has_training_docs(): + fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd) + else: + if self._fewshot_docs is None: + self._fewshot_docs = list( + self.validation_docs() + if self.has_validation_docs() + else self.test_docs() + ) + + fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + fewshotex = [x for x in fewshotex if x != doc][:num_fewshot] + + labeled_examples = ( + "\n\n".join( + [ + self.doc_to_text(doc) + self.doc_to_target(doc) + for doc in fewshotex + ] + ) + + "\n\n" ) example = self.doc_to_text(doc) - if type(example) == str: - return labeled_examples + example - elif type(example) == list: - return [labeled_examples + ex for ex in example] - elif type(example) == int: - if self.config.doc_to_choice is not None: - choices = self.doc_to_choice(doc) - return labeled_examples + choices[example] - else: - return labeled_examples + str(example) + return description + labeled_examples + example def apply_filters(self): if hasattr(self, "_filters"): for f in self._filters: - f.apply(self._instances) + f.apply(self._instances, None) else: eval_logger.warning("No filter defined, passing through instances") return self._instances @@ -764,6 +800,41 @@ class ConfigurableTask(Task): ) return super().fewshot_docs() + + @utils.positional_deprecated + def fewshot_context(self, doc, num_fewshot): + """Returns a fewshot context string that is made up of a prepended description + (if provided), the `num_fewshot` number of examples, and an appended prompt example. + + :param doc: str + The document as returned from training_docs, validation_docs, or test_docs. + :param num_fewshot: int + The number of fewshot examples to provide in the returned context string. + :returns: str + The fewshot context. + """ + + if num_fewshot == 0: + # always prepend the (possibly empty) task description + labeled_examples = self.config.description + else: + labeled_examples = self.config.description + self.sampler.get_context( + doc, num_fewshot + ) + + example = self.doc_to_text(doc) + if type(example) == str: + return labeled_examples + example + elif type(example) == list: + return [labeled_examples + ex for ex in example] + elif type(example) == int: + if self.config.doc_to_choice is not None: + choices = self.doc_to_choice(doc) + return labeled_examples + choices[example] + else: + return labeled_examples + str(example) + + def apply_filters(self): if hasattr(self, "_filters"): for f in self._filters: diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 3fa9633e..072664b3 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -268,12 +268,9 @@ def evaluate( eval_logger.info(f"Request: {str(inst)}") # aggregate Instances by LM method requested to get output. 
- reqtype = ( - "loglikelihood" - if task.OUTPUT_TYPE == "multiple_choice" - else task.OUTPUT_TYPE - ) # TODO: this is hacky, fix in task.py - requests[reqtype].extend(task.instances) + for instance in task.instances: + reqtype = instance.request_type + requests[reqtype].append(instance) if lm.world_size > 1: instances_rnk = torch.tensor(len(task._instances), device=lm.device) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index d36b6077..ca1ecef8 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -15,6 +15,8 @@ from lm_eval.api.registry import ( import logging +from .squad import SQuAD2 + eval_logger = logging.getLogger("lm-eval") diff --git a/lm_eval/tasks/squad.py b/lm_eval/tasks/squad.py new file mode 100644 index 00000000..638b9ded --- /dev/null +++ b/lm_eval/tasks/squad.py @@ -0,0 +1,211 @@ +import datasets + +from math import exp +from functools import partial +from packaging import version + +from lm_eval.api.task import Task +from lm_eval.api.instance import Instance +from lm_eval.api.registry import register_task + +def _squad_metric(predictions, references): + squad_metric = datasets.load_metric("squad_v2") + return squad_metric.compute(predictions=predictions, references=references) + + +def _squad_agg(key, items): + predictions, references = zip(*items) + + return _squad_metric(predictions=predictions, references=references).get(key, 0) + + +@register_task("squadv2") +class SQuAD2(Task): + VERSION = 1 + DATASET_PATH = "squad_v2" + DATASET_NAME = None + + # HF changed squad on us so we have to make sure we aren't running the old one + assert version.parse(datasets.__version__) >= version.parse( + "1.11.0" + ), "datasets v1.11.0 or later required for SQuAD" + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + return self.dataset["train"] + + def validation_docs(self): + return self.dataset["validation"] + + def doc_to_text(self, doc): + return ( + "Title: " + + doc["title"] + + "\n\n" + + "Background: " + + doc["context"] + + "\n\n" + + "Question: " + + doc["question"] + + "\n\n" + + "Answer:" + ) + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["context"] + + def doc_to_target(self, doc): + answer_list = doc["answers"]["text"] + if len(answer_list) > 0: + answer = answer_list[0] + else: + answer = "unanswerable" + return " " + answer + + def construct_requests(self, doc, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. 
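+
+        :returns:
+            A list of two Instances: a `generate_until` request that produces the
+            model's free-form answer, and a `loglikelihood` request that scores the
+            fixed continuation " unanswerable", whose probability is exponentiated
+            into `no_answer_probability` in `process_results` below.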
+ """ + + return [ + Instance( + request_type="generate_until", + doc=doc, + arguments=(ctx, {"until": ["\n"]}), + idx=0, + **kwargs + ), + Instance( + request_type="loglikelihood", + doc=doc, + arguments=(ctx, " " + "unanswerable"), + idx=0, + **kwargs + ) + ] + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + + continuation, (logprob_unanswerable, _) = results + + no_answer_probability = exp(logprob_unanswerable) + + predictions = { + "id": doc["id"], + "prediction_text": continuation, + "no_answer_probability": no_answer_probability, + } + + references = { + "id": doc["id"], + "answers": doc["answers"], + } + + return { + "exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "best_exact": ( + predictions, + references, + ), # Best exact match (with varying threshold) + "best_f1": (predictions, references), # Best F1 (with varying threshold) + } + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + return { + "exact": partial( + _squad_agg, "exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": partial( + _squad_agg, "f1" + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": partial( + _squad_agg, "HasAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": partial( + _squad_agg, "HasAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": partial( + _squad_agg, "NoAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": partial( + _squad_agg, "NoAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "best_exact": partial( + _squad_agg, "best_exact" + ), # Best exact match (with varying threshold) + "best_f1": partial( + _squad_agg, "best_f1" + ), # Best F1 (with varying threshold) + } + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + return { + "exact": True, # Exact match (the normalized answer exactly match the gold answer) + "f1": True, # The F-score of predicted tokens versus the gold answer + "HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": True, # The F-score of predicted tokens versus the gold answer + "NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": 
True, # The F-score of predicted tokens versus the gold answer + "best_exact": True, # Best exact match (with varying threshold) + "best_f1": True, # Best F1 (with varying threshold) + } \ No newline at end of file diff --git a/lm_eval/tasks/squadv2/default.yaml b/lm_eval/tasks/squadv2/default.yaml index 0f42bf54..3ba9e98f 100644 --- a/lm_eval/tasks/squadv2/default.yaml +++ b/lm_eval/tasks/squadv2/default.yaml @@ -1,5 +1,5 @@ include: _template_yaml -task: squadv2 +task: squadv2_generate_until output_type: generate_until generation_kwargs: until: diff --git a/lm_eval/tasks/squadv2/with_noans_prob.yaml b/lm_eval/tasks/squadv2/with_noans_prob.yaml index dacfb920..e17b086b 100644 --- a/lm_eval/tasks/squadv2/with_noans_prob.yaml +++ b/lm_eval/tasks/squadv2/with_noans_prob.yaml @@ -1,4 +1,4 @@ group: squadv2_complete task: - - squadv2 + - squadv2_generate_until - squadv2_noans_loglikelihood -- GitLab From e1d5c849740d3ac52e9a745ba2396306bf0145a0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 7 Nov 2023 07:03:44 +0000 Subject: [PATCH 177/212] moved squad.py and format changes --- lm_eval/api/task.py | 15 +++++++-------- lm_eval/tasks/__init__.py | 2 +- lm_eval/tasks/{ => squadv2}/squad.py | 0 3 files changed, 8 insertions(+), 9 deletions(-) rename lm_eval/tasks/{ => squadv2}/squad.py (100%) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 4dc79888..3e34f94c 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -208,7 +208,6 @@ class Task(abc.ABC): TaskConfig( { **config, - **{"dataset_path": DATASET_PATH, "dataset_name": DATASET_NAME}, } ) if config @@ -217,7 +216,6 @@ class Task(abc.ABC): self._filters = [build_filter_ensemble("none", [["take_first", None]])] - def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None: """Downloads and returns the task dataset. Override this method to download the dataset from a custom API. @@ -357,9 +355,7 @@ class Task(abc.ABC): False ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" - eval_logger.info( - f"Building contexts for task on rank {rank}..." - ) + eval_logger.info(f"Building contexts for task on rank {rank}...") instances = [] for doc_id, doc in utils.create_iterator( @@ -450,7 +446,12 @@ class Task(abc.ABC): @utils.positional_deprecated def fewshot_context( - self, doc, num_fewshot, provide_description=None, rnd=random.Random(1234), description=None + self, + doc, + num_fewshot, + provide_description=None, + rnd=random.Random(1234), + description=None, ): """Returns a fewshot context string that is made up of a prepended description (if provided), the `num_fewshot` number of examples, and an appended prompt example. 
@@ -800,7 +801,6 @@ class ConfigurableTask(Task): ) return super().fewshot_docs() - @utils.positional_deprecated def fewshot_context(self, doc, num_fewshot): """Returns a fewshot context string that is made up of a prepended description @@ -834,7 +834,6 @@ class ConfigurableTask(Task): else: return labeled_examples + str(example) - def apply_filters(self): if hasattr(self, "_filters"): for f in self._filters: diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index ca1ecef8..2d7a5891 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -15,7 +15,7 @@ from lm_eval.api.registry import ( import logging -from .squad import SQuAD2 +from .squadv2.squad import SQuAD2 eval_logger = logging.getLogger("lm-eval") diff --git a/lm_eval/tasks/squad.py b/lm_eval/tasks/squadv2/squad.py similarity index 100% rename from lm_eval/tasks/squad.py rename to lm_eval/tasks/squadv2/squad.py -- GitLab From 7b60ead42f00458cbcb59391088ebbf83d220ee1 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 8 Nov 2023 03:00:02 +0000 Subject: [PATCH 178/212] added comment line --- lm_eval/tasks/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 2d7a5891..faa4eb6e 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -15,9 +15,11 @@ from lm_eval.api.registry import ( import logging +eval_logger = logging.getLogger("lm-eval") + +# import python tasks from .squadv2.squad import SQuAD2 -eval_logger = logging.getLogger("lm-eval") def register_configurable_task(config: Dict[str, str]) -> int: -- GitLab From 9817e7c2c925307d28af3c8fec9dea6116d81c87 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 8 Nov 2023 03:04:54 +0000 Subject: [PATCH 179/212] removed yaml version of squad --- lm_eval/tasks/squad.py | 211 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 lm_eval/tasks/squad.py diff --git a/lm_eval/tasks/squad.py b/lm_eval/tasks/squad.py new file mode 100644 index 00000000..638b9ded --- /dev/null +++ b/lm_eval/tasks/squad.py @@ -0,0 +1,211 @@ +import datasets + +from math import exp +from functools import partial +from packaging import version + +from lm_eval.api.task import Task +from lm_eval.api.instance import Instance +from lm_eval.api.registry import register_task + +def _squad_metric(predictions, references): + squad_metric = datasets.load_metric("squad_v2") + return squad_metric.compute(predictions=predictions, references=references) + + +def _squad_agg(key, items): + predictions, references = zip(*items) + + return _squad_metric(predictions=predictions, references=references).get(key, 0) + + +@register_task("squadv2") +class SQuAD2(Task): + VERSION = 1 + DATASET_PATH = "squad_v2" + DATASET_NAME = None + + # HF changed squad on us so we have to make sure we aren't running the old one + assert version.parse(datasets.__version__) >= version.parse( + "1.11.0" + ), "datasets v1.11.0 or later required for SQuAD" + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + return self.dataset["train"] + + def validation_docs(self): + return self.dataset["validation"] + + def doc_to_text(self, doc): + return ( + "Title: " + + doc["title"] + + "\n\n" + + "Background: " + + doc["context"] + + "\n\n" + + "Question: " + + doc["question"] + + "\n\n" + + "Answer:" + ) + + def should_decontaminate(self): + return True + + def 
doc_to_decontamination_query(self, doc): + return doc["context"] + + def doc_to_target(self, doc): + answer_list = doc["answers"]["text"] + if len(answer_list) > 0: + answer = answer_list[0] + else: + answer = "unanswerable" + return " " + answer + + def construct_requests(self, doc, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + """ + + return [ + Instance( + request_type="generate_until", + doc=doc, + arguments=(ctx, {"until": ["\n"]}), + idx=0, + **kwargs + ), + Instance( + request_type="loglikelihood", + doc=doc, + arguments=(ctx, " " + "unanswerable"), + idx=0, + **kwargs + ) + ] + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + + continuation, (logprob_unanswerable, _) = results + + no_answer_probability = exp(logprob_unanswerable) + + predictions = { + "id": doc["id"], + "prediction_text": continuation, + "no_answer_probability": no_answer_probability, + } + + references = { + "id": doc["id"], + "answers": doc["answers"], + } + + return { + "exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "best_exact": ( + predictions, + references, + ), # Best exact match (with varying threshold) + "best_f1": (predictions, references), # Best F1 (with varying threshold) + } + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + return { + "exact": partial( + _squad_agg, "exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": partial( + _squad_agg, "f1" + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": partial( + _squad_agg, "HasAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": partial( + _squad_agg, "HasAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": partial( + _squad_agg, "NoAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": partial( + _squad_agg, "NoAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "best_exact": partial( + _squad_agg, "best_exact" + ), # Best exact match (with varying threshold) + 
"best_f1": partial( + _squad_agg, "best_f1" + ), # Best F1 (with varying threshold) + } + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + return { + "exact": True, # Exact match (the normalized answer exactly match the gold answer) + "f1": True, # The F-score of predicted tokens versus the gold answer + "HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": True, # The F-score of predicted tokens versus the gold answer + "NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": True, # The F-score of predicted tokens versus the gold answer + "best_exact": True, # Best exact match (with varying threshold) + "best_f1": True, # Best F1 (with varying threshold) + } \ No newline at end of file -- GitLab From d3f429ac175b87c1f7a83908c3de8f39745c737c Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 8 Nov 2023 03:16:18 +0000 Subject: [PATCH 180/212] removed yaml version of squad --- lm_eval/tasks/__init__.py | 2 +- lm_eval/tasks/squadv2/README.md | 55 ------ lm_eval/tasks/squadv2/_template_yaml | 8 - lm_eval/tasks/squadv2/default.yaml | 13 -- lm_eval/tasks/squadv2/no_ans.yaml | 6 - lm_eval/tasks/squadv2/squad.py | 211 --------------------- lm_eval/tasks/squadv2/utils.py | 51 ----- lm_eval/tasks/squadv2/with_noans_prob.yaml | 4 - 8 files changed, 1 insertion(+), 349 deletions(-) delete mode 100644 lm_eval/tasks/squadv2/README.md delete mode 100644 lm_eval/tasks/squadv2/_template_yaml delete mode 100644 lm_eval/tasks/squadv2/default.yaml delete mode 100644 lm_eval/tasks/squadv2/no_ans.yaml delete mode 100644 lm_eval/tasks/squadv2/squad.py delete mode 100644 lm_eval/tasks/squadv2/utils.py delete mode 100644 lm_eval/tasks/squadv2/with_noans_prob.yaml diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index faa4eb6e..dab8a7ed 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -18,7 +18,7 @@ import logging eval_logger = logging.getLogger("lm-eval") # import python tasks -from .squadv2.squad import SQuAD2 +from .squad import SQuAD2 diff --git a/lm_eval/tasks/squadv2/README.md b/lm_eval/tasks/squadv2/README.md deleted file mode 100644 index f29ad171..00000000 --- a/lm_eval/tasks/squadv2/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# Task-name - -### Paper - -Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD` -Abstract: https://arxiv.org/abs/1806.03822 - -Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, -consisting of questions posed by crowdworkers on a set of Wikipedia articles, -where the answer to every question is a segment of text, or span, from the -corresponding reading passage, or the question might be unanswerable. -SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable -questions written adversarially by crowdworkers to look similar to answerable ones. -To do well on SQuAD2.0, systems must not only answer questions when possible, but -also determine when no answer is supported by the paragraph and abstain from answering. 
- -Homepage: https://rajpurkar.github.io/SQuAD-explorer/ - - -### Citation - -``` -@misc{rajpurkar2018know, - title={Know What You Don't Know: Unanswerable Questions for SQuAD}, - author={Pranav Rajpurkar and Robin Jia and Percy Liang}, - year={2018}, - eprint={1806.03822}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} -``` - -### Groups and Tasks - -#### Groups - -* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood` - -#### Tasks - -* `squadv2`: `Default squadv2 task` -* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of model predicting there is no answer` - -### Checklist - -For adding novel benchmarks/datasets to the library: -* [ ] Is the task an existing benchmark in the literature? - * [ ] Have you referenced the original paper that introduced the task? - * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? - - -If other tasks on this dataset are already supported: -* [ ] Is the "Main" variant of this task clearly denoted? -* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? -* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/squadv2/_template_yaml b/lm_eval/tasks/squadv2/_template_yaml deleted file mode 100644 index 05bb724a..00000000 --- a/lm_eval/tasks/squadv2/_template_yaml +++ /dev/null @@ -1,8 +0,0 @@ -dataset_path: squad_v2 -training_split: train -validation_split: validation -doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:" -doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}" -target_delimiter: "" -should_decontaminate: true -doc_to_decontamination_query: context diff --git a/lm_eval/tasks/squadv2/default.yaml b/lm_eval/tasks/squadv2/default.yaml deleted file mode 100644 index 3ba9e98f..00000000 --- a/lm_eval/tasks/squadv2/default.yaml +++ /dev/null @@ -1,13 +0,0 @@ -include: _template_yaml -task: squadv2_generate_until -output_type: generate_until -generation_kwargs: - until: - - "\n" -metric_list: - - metric: !function utils.exact - aggregation: mean - higher_is_better: true - - metric: !function utils.f1 - aggregation: mean - higher_is_better: true diff --git a/lm_eval/tasks/squadv2/no_ans.yaml b/lm_eval/tasks/squadv2/no_ans.yaml deleted file mode 100644 index 7b0a47c7..00000000 --- a/lm_eval/tasks/squadv2/no_ans.yaml +++ /dev/null @@ -1,6 +0,0 @@ -include: _template_yaml -task: squadv2_noans_loglikelihood -output_type: loglikelihood -doc_to_target: " unanswerable" -metric_list: - - metric: perplexity diff --git a/lm_eval/tasks/squadv2/squad.py b/lm_eval/tasks/squadv2/squad.py deleted file mode 100644 index 638b9ded..00000000 --- a/lm_eval/tasks/squadv2/squad.py +++ /dev/null @@ -1,211 +0,0 @@ -import datasets - -from math import exp -from functools import partial -from packaging import version - -from lm_eval.api.task import Task -from lm_eval.api.instance import Instance -from lm_eval.api.registry import register_task - -def _squad_metric(predictions, references): - squad_metric = datasets.load_metric("squad_v2") - return squad_metric.compute(predictions=predictions, references=references) - - -def _squad_agg(key, items): - predictions, references = zip(*items) - - return _squad_metric(predictions=predictions, references=references).get(key, 0) - - -@register_task("squadv2") -class SQuAD2(Task): 
- VERSION = 1 - DATASET_PATH = "squad_v2" - DATASET_NAME = None - - # HF changed squad on us so we have to make sure we aren't running the old one - assert version.parse(datasets.__version__) >= version.parse( - "1.11.0" - ), "datasets v1.11.0 or later required for SQuAD" - - def has_training_docs(self): - return True - - def has_validation_docs(self): - return True - - def has_test_docs(self): - return False - - def training_docs(self): - return self.dataset["train"] - - def validation_docs(self): - return self.dataset["validation"] - - def doc_to_text(self, doc): - return ( - "Title: " - + doc["title"] - + "\n\n" - + "Background: " - + doc["context"] - + "\n\n" - + "Question: " - + doc["question"] - + "\n\n" - + "Answer:" - ) - - def should_decontaminate(self): - return True - - def doc_to_decontamination_query(self, doc): - return doc["context"] - - def doc_to_target(self, doc): - answer_list = doc["answers"]["text"] - if len(answer_list) > 0: - answer = answer_list[0] - else: - answer = "unanswerable" - return " " + answer - - def construct_requests(self, doc, ctx, **kwargs): - """Uses RequestFactory to construct Requests and returns an iterable of - Requests which will be sent to the LM. - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param ctx: str - The context string, generated by fewshot_context. This includes the natural - language description, as well as the few shot examples, and the question - part of the document for `doc`. - """ - - return [ - Instance( - request_type="generate_until", - doc=doc, - arguments=(ctx, {"until": ["\n"]}), - idx=0, - **kwargs - ), - Instance( - request_type="loglikelihood", - doc=doc, - arguments=(ctx, " " + "unanswerable"), - idx=0, - **kwargs - ) - ] - - def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of - the metric for that one document - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param results: - The results of the requests created in construct_requests. 
- """ - - continuation, (logprob_unanswerable, _) = results - - no_answer_probability = exp(logprob_unanswerable) - - predictions = { - "id": doc["id"], - "prediction_text": continuation, - "no_answer_probability": no_answer_probability, - } - - references = { - "id": doc["id"], - "answers": doc["answers"], - } - - return { - "exact": ( - predictions, - references, - ), # Exact match (the normalized answer exactly match the gold answer) - "f1": ( - predictions, - references, - ), # The F-score of predicted tokens versus the gold answer - "HasAns_exact": ( - predictions, - references, - ), # Exact match (the normalized answer exactly match the gold answer) - "HasAns_f1": ( - predictions, - references, - ), # The F-score of predicted tokens versus the gold answer - "NoAns_exact": ( - predictions, - references, - ), # Exact match (the normalized answer exactly match the gold answer) - "NoAns_f1": ( - predictions, - references, - ), # The F-score of predicted tokens versus the gold answer - "best_exact": ( - predictions, - references, - ), # Best exact match (with varying threshold) - "best_f1": (predictions, references), # Best F1 (with varying threshold) - } - - def aggregation(self): - """ - :returns: {str: [float] -> float} - A dictionary where keys are the names of submetrics and values are - functions that aggregate a list of metrics - """ - return { - "exact": partial( - _squad_agg, "exact" - ), # Exact match (the normalized answer exactly match the gold answer) - "f1": partial( - _squad_agg, "f1" - ), # The F-score of predicted tokens versus the gold answer - "HasAns_exact": partial( - _squad_agg, "HasAns_exact" - ), # Exact match (the normalized answer exactly match the gold answer) - "HasAns_f1": partial( - _squad_agg, "HasAns_f1" - ), # The F-score of predicted tokens versus the gold answer - "NoAns_exact": partial( - _squad_agg, "NoAns_exact" - ), # Exact match (the normalized answer exactly match the gold answer) - "NoAns_f1": partial( - _squad_agg, "NoAns_f1" - ), # The F-score of predicted tokens versus the gold answer - "best_exact": partial( - _squad_agg, "best_exact" - ), # Best exact match (with varying threshold) - "best_f1": partial( - _squad_agg, "best_f1" - ), # Best F1 (with varying threshold) - } - - def higher_is_better(self): - """ - :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are - whether a higher value of the submetric is better - """ - return { - "exact": True, # Exact match (the normalized answer exactly match the gold answer) - "f1": True, # The F-score of predicted tokens versus the gold answer - "HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) - "HasAns_f1": True, # The F-score of predicted tokens versus the gold answer - "NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) - "NoAns_f1": True, # The F-score of predicted tokens versus the gold answer - "best_exact": True, # Best exact match (with varying threshold) - "best_f1": True, # Best F1 (with varying threshold) - } \ No newline at end of file diff --git a/lm_eval/tasks/squadv2/utils.py b/lm_eval/tasks/squadv2/utils.py deleted file mode 100644 index 8c788cc0..00000000 --- a/lm_eval/tasks/squadv2/utils.py +++ /dev/null @@ -1,51 +0,0 @@ -import re -import string -import collections - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) - return re.sub(regex, 
" ", text) - - def white_space_fix(text): - return " ".join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return "".join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def get_tokens(s): - if not s: - return [] - return normalize_answer(s).split() - - -# Exact match (the normalized answer exactly match the gold answer) -def exact(predictions, references): - return int(normalize_answer(references[0]) == normalize_answer(predictions[0])) - - -# The F-score of predicted tokens versus the gold answer -def f1(predictions, references): - gold_toks = get_tokens(references[0]) - pred_toks = get_tokens(predictions[0]) - common = collections.Counter(gold_toks) & collections.Counter(pred_toks) - num_same = sum(common.values()) - if len(gold_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 diff --git a/lm_eval/tasks/squadv2/with_noans_prob.yaml b/lm_eval/tasks/squadv2/with_noans_prob.yaml deleted file mode 100644 index e17b086b..00000000 --- a/lm_eval/tasks/squadv2/with_noans_prob.yaml +++ /dev/null @@ -1,4 +0,0 @@ -group: squadv2_complete -task: - - squadv2_generate_until - - squadv2_noans_loglikelihood -- GitLab From 23bb863605e311810fd3e2bbc77cf29589a24093 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 9 Nov 2023 09:51:19 +0000 Subject: [PATCH 181/212] add description on task/group alias --- docs/new_task_guide.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 86966be5..b345f9a7 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -273,6 +273,23 @@ to the top of any Python file that is run or imported when performing evaluation Passing `--tasks /path/to/yaml/file` is also accepted. +## Beautifying Table Display + +To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. + +for example in `mmlu_abstract_algebra.yaml` +``` +"dataset_name": "abstract_algebra" +"description": "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" +"include": "_default_template_yaml" +"task": "mmlu_abstract_algebra" +"task_alias": "abstract_algebra" +``` + + ## Checking validity After registering your task, you can now check on your data downloading and verify that the few-shot samples look as intended. 
Run the following command with your desired args: -- GitLab From e376ad8a8a96d9bdaaed715612de24605a854714 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 9 Nov 2023 13:47:40 +0000 Subject: [PATCH 182/212] update --- docs/new_task_guide.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index b345f9a7..9debb8ef 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -276,8 +276,9 @@ Passing `--tasks /path/to/yaml/file` is also accepted. ## Beautifying Table Display To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. +`` +for example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`. -for example in `mmlu_abstract_algebra.yaml` ``` "dataset_name": "abstract_algebra" "description": "The following are multiple choice questions (with answers) about abstract\ @@ -288,7 +289,7 @@ for example in `mmlu_abstract_algebra.yaml` "task": "mmlu_abstract_algebra" "task_alias": "abstract_algebra" ``` - +Note: Even though `group` can be a list, for now, `group_alias` can only be a single string. ## Checking validity -- GitLab From a35206191acac1776761e737b66e0d04975d21b9 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 9 Nov 2023 14:18:42 +0000 Subject: [PATCH 183/212] readded task descriptions --- lm_eval/tasks/squad.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/lm_eval/tasks/squad.py b/lm_eval/tasks/squad.py index 638b9ded..5ff66315 100644 --- a/lm_eval/tasks/squad.py +++ b/lm_eval/tasks/squad.py @@ -1,3 +1,18 @@ +""" +Know What You Don’t Know: Unanswerable Questions for SQuAD +https://arxiv.org/pdf/1806.03822.pdf + +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, +consisting of questions posed by crowdworkers on a set of Wikipedia articles, +where the answer to every question is a segment of text, or span, from the +corresponding reading passage, or the question might be unanswerable. +SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable +questions written adversarially by crowdworkers to look similar to answerable ones. +To do well on SQuAD2.0, systems must not only answer questions when possible, but +also determine when no answer is supported by the paragraph and abstain from answering. 
+
+Homepage: https://rajpurkar.github.io/SQuAD-explorer/
+"""
 import datasets

 from math import exp
 from functools import partial
 from packaging import version

 from lm_eval.api.task import Task
 from lm_eval.api.instance import Instance
 from lm_eval.api.registry import register_task

+_CITATION = """
+@misc{rajpurkar2018know,
+    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
+    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
+    year={2018},
+    eprint={1806.03822},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+
 def _squad_metric(predictions, references):
     squad_metric = datasets.load_metric("squad_v2")
     return squad_metric.compute(predictions=predictions, references=references)
--
GitLab

From 7513783645d1b3a3321b6f8eba43219aaa35b99a Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Thu, 9 Nov 2023 14:23:07 +0000
Subject: [PATCH 184/212] add scrolls

---
 lm_eval/tasks/scrolls.py | 490 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 490 insertions(+)
 create mode 100644 lm_eval/tasks/scrolls.py

diff --git a/lm_eval/tasks/scrolls.py b/lm_eval/tasks/scrolls.py
new file mode 100644
index 00000000..5aece9cc
--- /dev/null
+++ b/lm_eval/tasks/scrolls.py
@@ -0,0 +1,490 @@
+"""
+SCROLLS: Standardized CompaRison Over Long Language Sequences
+https://arxiv.org/abs/2201.03533
+
+SCROLLS is a suite of datasets that require synthesizing information over long texts.
+The benchmark includes seven natural language tasks across multiple domains,
+including summarization, question answering, and natural language inference.
+
+Homepage: https://www.scrolls-benchmark.com/
+
+Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
+it is possible to create "subset" tasks that contain only those samples whose tokenized length
+is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
+be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:
+
+```
+class QasperGPTNeoX4K(Qasper):
+    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
+    PRUNE_MAX_TOKENS = 4096
+    PRUNE_NUM_PROC = _num_cpu_cores() # optional, to speed up pruning of large datasets like NarrativeQA
+```
+
+`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
+less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
+that use different tokenizers but the same maximum sequence length.
+
+Once the subset task class has been defined in this file, it can be used by adding the class
+to `lm_eval/tasks/__init__.py`.
+
+NOTE: GovReport may need `max_gen_toks` set larger for causal models. 
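+
+For that registration step, a sketch (the import below is an assumption --
+mirror however python tasks are imported in your copy of
+`lm_eval/tasks/__init__.py`, e.g. alongside the existing
+`from .squad import SQuAD2` import):
+
+```
+# in lm_eval/tasks/__init__.py
+from .scrolls import QasperGPTNeoX4K
+```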
+""" +import re +import numpy as np +import transformers.data.metrics.squad_metrics as squad_metrics + +from abc import abstractmethod +from datasets import load_metric +from transformers import AutoTokenizer +from functools import reduce + +from lm_eval.api.task import Task +from lm_eval.api.metrics import mean +from lm_eval.api.instance import Instance +from lm_eval.api.registry import register_task + +_CITATION = """ +@inproceedings{shaham-etal-2022-scrolls, + title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences", + author = "Shaham, Uri and + Segal, Elad and + Ivgi, Maor and + Efrat, Avia and + Yoran, Ori and + Haviv, Adi and + Gupta, Ankit and + Xiong, Wenhan and + Geva, Mor and + Berant, Jonathan and + Levy, Omer", + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.emnlp-main.823", + pages = "12007--12021" +} +""" + +# SCROLLS is formualted as a sequence-to-sequence task. +# To allow for evaluation of causal models, we'll +# reformualte these with appropriate prompts + + +def _download_metric(): + import os + import shutil + from huggingface_hub import hf_hub_download + + scrolls_metric_path = hf_hub_download( + repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py" + ) + updated_scrolls_metric_path = ( + os.path.dirname(scrolls_metric_path) + + os.path.basename(scrolls_metric_path).replace(".", "_") + + ".py" + ) + shutil.copy(scrolls_metric_path, updated_scrolls_metric_path) + return updated_scrolls_metric_path + + +def _process_doc_prepended_question(doc): + # "When a query is given in addition to the raw text (as + # in QMSum, Qasper, NarrativeQA, QuALITY, and ContractNLI), + # we prepend it to the text, using two newlines as a natural separator" + input = doc["input"] + split = input.find("\n\n") + return { + "id": doc["id"], + "pid": doc["pid"], + "input": input, + "outputs": doc["outputs"], + "question": input[0:split], + "text": input[split + 2 :], + } + + +def _drop_duplicates_in_input(untokenized_dataset): + # from scrolls/evaluator/dataset_evaluator.py + + indices_to_keep = [] + id_to_idx = {} + outputs = [] + for i, (id_, output) in enumerate( + zip(untokenized_dataset["id"], untokenized_dataset["output"]) + ): + if id_ in id_to_idx: + outputs[id_to_idx[id_]].append(output) + continue + indices_to_keep.append(i) + id_to_idx[id_] = len(outputs) + outputs.append([output]) + untokenized_dataset = untokenized_dataset.select(indices_to_keep).flatten_indices() + untokenized_dataset = untokenized_dataset.remove_columns("output") + untokenized_dataset = untokenized_dataset.add_column("outputs", outputs) + return untokenized_dataset + + +def _num_cpu_cores(): + # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170 + try: + import psutil + + return psutil.cpu_count(logical=False) + except ImportError: + import os + + return len(os.sched_getaffinity(0)) + + +class _SCROLLSTask(Task): + VERSION = 0 + DATASET_PATH = "tau/scrolls" + DATASET_NAME = None + PRUNE_TOKENIZERS = None + PRUNE_MAX_TOKENS = None + PRUNE_NUM_PROC = None + + def __init__(self, no_metric=False): + super().__init__() + self.metric = ( + load_metric(_download_metric(), config_name=self.DATASET_NAME) + if not no_metric + else None + ) + + def has_training_docs(self): + return True + + def 
+
+
+def _drop_duplicates_in_input(untokenized_dataset):
+    # from scrolls/evaluator/dataset_evaluator.py
+
+    indices_to_keep = []
+    id_to_idx = {}
+    outputs = []
+    for i, (id_, output) in enumerate(
+        zip(untokenized_dataset["id"], untokenized_dataset["output"])
+    ):
+        if id_ in id_to_idx:
+            outputs[id_to_idx[id_]].append(output)
+            continue
+        indices_to_keep.append(i)
+        id_to_idx[id_] = len(outputs)
+        outputs.append([output])
+    untokenized_dataset = untokenized_dataset.select(indices_to_keep).flatten_indices()
+    untokenized_dataset = untokenized_dataset.remove_columns("output")
+    untokenized_dataset = untokenized_dataset.add_column("outputs", outputs)
+    return untokenized_dataset
+
+
+def _num_cpu_cores():
+    # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
+    try:
+        import psutil
+
+        return psutil.cpu_count(logical=False)
+    except ImportError:
+        import os
+
+        return len(os.sched_getaffinity(0))
+
+
+class _SCROLLSTask(Task):
+    VERSION = 0
+    DATASET_PATH = "tau/scrolls"
+    DATASET_NAME = None
+    PRUNE_TOKENIZERS = None
+    PRUNE_MAX_TOKENS = None
+    PRUNE_NUM_PROC = None
+
+    def __init__(self, no_metric=False):
+        super().__init__()
+        self.metric = (
+            load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            if not no_metric
+            else None
+        )
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return False
+
+    def training_docs(self):
+        for doc in self.dataset["train"]:
+            yield from self._process_doc(doc)
+
+    def validation_docs(self):
+        for doc in self.dataset["validation"]:
+            yield from self._process_doc(doc)
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["input"]
+
+    def download(self, *args, **kwargs):
+        super().download(*args, **kwargs)
+        del self.dataset["test"]
+        for split in self.dataset:
+            self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
+        if self.PRUNE_TOKENIZERS is not None and self.PRUNE_MAX_TOKENS is not None:
+            self.prune()
+
+    def _get_prune_text(self, sample):
+        return self.doc_to_text(self._process_doc(sample)[0])
+
+    def prune(self):
+        """Create a pruned version of a SCROLLS task dataset containing only inputs
+        that are less than `max_tokens` when tokenized by each tokenizer
+        """
+
+        tokenizers = [
+            AutoTokenizer.from_pretrained(tokenizer)
+            for tokenizer in self.PRUNE_TOKENIZERS
+        ]
+        cache = {}
+
+        def _filter(sample):
+            text = self._get_prune_text(sample)
+            cached = cache.get(text, None)
+            if cached is None:
+                for tokenizer in tokenizers:
+                    if len(tokenizer(text).input_ids) > self.PRUNE_MAX_TOKENS:
+                        cache[text] = False
+                        return False
+                cache[text] = True
+                return True
+            else:
+                return cached
+
+        self.dataset = self.dataset.filter(_filter, num_proc=self.PRUNE_NUM_PROC)
+
+    def doc_to_target(self, doc):
+        return " " + ", ".join(doc["outputs"])
+
+    def doc_to_text(self, doc):
+        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"
+
+    def higher_is_better(self):
+        return {x: True for x in self._scrolls_metrics().keys()}
+
+    @abstractmethod
+    def _scrolls_metrics(self):
+        pass
+
+    def _make_compute_metrics(self, value):
+        def compute_metrics(samples):
+            predictions, references = zip(*samples)  # unzip, if you will
+            computed = self.metric.compute(
+                predictions=predictions, references=references
+            )
+            return computed[value]
+
+        return compute_metrics
+
+    def aggregation(self):
+        return {
+            key: self._make_compute_metrics(value)
+            for key, value in self._scrolls_metrics().items()
+        }
+
+
+class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
+    def __init__(self):
+        super().__init__(no_metric=True)
+
+    def _scrolls_metrics(self):
+        return None
+
+    def aggregation(self):
+        return {"em": mean, "acc": mean, "acc_norm": mean}
+
+    def higher_is_better(self):
+        return {"em": True, "acc": True, "acc_norm": True}
+
+    def process_results(self, doc, results):
+        gold = doc["gold"]
+
+        acc = 1.0 if np.argmax(results) == gold else 0.0
+        completion_len = np.array([float(len(i)) for i in doc["choices"]])
+        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
+
+        return {
+            "acc": acc,
+            "acc_norm": acc_norm,
+            "em": acc_norm * 100.0,
+        }
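+
+    # One loglikelihood request is issued per answer choice; process_results
+    # above then argmaxes over the returned scores (optionally normalized by
+    # choice length), e.g. a 4-option QuALITY doc yields Instances with idx 0-3.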
+
+    def construct_requests(self, doc, ctx):
+
+        request_list = [
+            Instance(
+                request_type="loglikelihood",
+                doc=doc,
+                arguments=(ctx, " {}".format(choice)),
+                idx=i,
+            )
+            for i, choice in enumerate(doc["choices"])
+        ]
+        return request_list
+
+
+class _SCROLLSSummaryTask(_SCROLLSTask):
+    def _process_doc(self, doc):
+        return [doc]
+
+    def _scrolls_metrics(self):
+        return {
+            "rouge1": "rouge/rouge1",
+            "rouge2": "rouge/rouge2",
+            "rougeL": "rouge/rougeL",
+        }
+
+    def process_results(self, doc, results):
+        return {
+            "rouge1": (results[0], doc["outputs"]),
+            "rouge2": (results[0], doc["outputs"]),
+            "rougeL": (results[0], doc["outputs"]),
+        }
+
+    def construct_requests(self, doc, ctx):
+        return Instance(
+            request_type="generate_until",
+            doc=doc,
+            arguments=(ctx, {"until": ["\n"]}),
+            idx=0,
+        )
+
+    def doc_to_text(self, doc):
+        return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
+
+
+@register_task("scrolls_qasper")
+class Qasper(_SCROLLSTask):
+    """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
+    https://arxiv.org/abs/2105.03011
+    """
+
+    DATASET_NAME = "qasper"
+
+    def _process_doc(self, doc):
+        doc = _process_doc_prepended_question(doc)
+        doc["is_yes_no"] = reduce(
+            lambda prev, cur: prev
+            and squad_metrics.normalize_answer(cur) in ["yes", "no"],
+            doc["outputs"],
+            True,
+        )
+        return [doc]
+
+    def _scrolls_metrics(self):
+        return {"f1": "f1"}
+
+    def process_results(self, doc, results):
+        if doc["is_yes_no"]:
+            prediction = " yes" if results[0] > results[1] else " no"
+        elif len(results[0].strip()) == 0:
+            prediction = "Unanswerable"
+        else:
+            prediction = results[0]
+        return {"f1": (prediction, doc["outputs"])}
+
+    def construct_requests(self, doc, ctx):
+        if doc["is_yes_no"]:
+            return [
+                Instance(
+                    request_type="loglikelihood",
+                    doc=doc,
+                    arguments=(ctx, " yes"),
+                    idx=0,
+                ),
+                Instance(
+                    request_type="loglikelihood",
+                    doc=doc,
+                    arguments=(ctx, " no"),
+                    idx=1,
+                ),
+            ]
+        else:
+            return Instance(
+                request_type="generate_until",
+                doc=doc,
+                arguments=(ctx, {"until": ["\n"]}),
+                idx=0,
+            )
+
+
+@register_task("scrolls_quality")
+class QuALITY(_SCROLLSMultipleChoiceTask):
+    """QuALITY: Question Answering with Long Input Texts, Yes!
+    https://arxiv.org/abs/2112.08608
+    """
+
+    DATASET_NAME = "quality"
+    _multiple_choice_pattern = re.compile(r" *\([A-D]\) *")
+
+    @staticmethod
+    def _normalize_answer(text):
+        return " ".join(text.split()).strip()
+
+    def _process_doc(self, doc):
+        doc = _process_doc_prepended_question(doc)
+
+        split = doc["text"].find("\n\n", doc["text"].find("(D)"))
+        choices_text = doc["text"][:split]
+
+        doc["text"] = doc["text"][split:].strip()
+        doc["choices"] = [
+            QuALITY._normalize_answer(choice)
+            for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
+        ]
+        doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))
+
+        return [doc]
+
+
+@register_task("scrolls_narrativeqa")
+class NarrativeQA(_SCROLLSTask):
+    """The NarrativeQA Reading Comprehension Challenge
+    https://arxiv.org/abs/1712.07040
+    """
+
+    DATASET_NAME = "narrative_qa"
+
+    def _process_doc(self, doc):
+        return [_process_doc_prepended_question(doc)]
+
+    def _scrolls_metrics(self):
+        return {"f1": "f1"}
+
+    def _get_prune_text(self, doc):
+        # pruning narrativeqa takes forever -- let's cheat a bit
+        # and just cache on the text, not the question, since
+        # the dataset is different questions about the same large
+        # documents
+        return self._process_doc(doc)[0]["text"]
+
+    def process_results(self, doc, results):
+        return {"f1": (results[0], doc["outputs"])}
+
+    def construct_requests(self, doc, ctx):
+        return Instance(
+            request_type="generate_until",
+            doc=doc,
+            arguments=(ctx, {"until": ["\n"]}),
+            idx=0,
+        )
+
+
+@register_task("scrolls_contractnli")
+class ContractNLI(_SCROLLSMultipleChoiceTask):
+    """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
+    https://arxiv.org/abs/2110.01799
+    """
+
+    DATASET_NAME = "contract_nli"
+    CHOICES = ["Not mentioned", "Entailment", "Contradiction"]
+
+    def _process_doc(self, doc):
+        doc = _process_doc_prepended_question(doc)
+        doc["choices"] = ContractNLI.CHOICES
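+        # map the gold output string (e.g. "Entailment") to its index in CHOICES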
+        doc["gold"] = ContractNLI.CHOICES.index(doc["outputs"][0])
+        return [doc]
+
+    def doc_to_text(self, doc):
+        return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"
+
+
+@register_task("scrolls_govreport")
+class GovReport(_SCROLLSSummaryTask):
+    """Efficient Attentions for Long Document Summarization
+    https://arxiv.org/abs/2104.02112
+
+    Note: The average length of the reference summaries is ~3,000
+    characters, or ~600 tokens as tokenized by GPT-NeoX. For causal models,
+    it is recommended to set `max_gen_toks` sufficiently large (e.g. 1024)
+    to allow a full summary to be generated.
+    """
+
+    DATASET_NAME = "gov_report"
+
+
+@register_task("scrolls_summscreenfd")
+class SummScreenFD(_SCROLLSSummaryTask):
+    """SummScreen: A Dataset for Abstractive Screenplay Summarization
+    https://arxiv.org/abs/2104.07091
+    """
+
+    DATASET_NAME = "summ_screen_fd"
+
+
+@register_task("scrolls_qmsum")
+class QMSum(_SCROLLSSummaryTask):
+    """QMSum: A New Benchmark for Query-based Multi-domain
+    Meeting Summarization
+
+    https://arxiv.org/abs/2104.05938
+    """
+
+    DATASET_NAME = "qmsum"
+
+    def _process_doc(self, doc):
+        return [_process_doc_prepended_question(doc)]
+
+    def doc_to_text(self, doc):
+        return f"{doc['text']}\n\nQuestion: {doc['question']}\nAnswer:"
--
GitLab


From 9a64e6423d1bb4b10035a86cf0aa55749a7b5025 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Thu, 9 Nov 2023 14:23:38 +0000
Subject: [PATCH 185/212] reformat

---
 lm_eval/api/task.py       | 10 +---------
 lm_eval/tasks/__init__.py |  4 ++--
 lm_eval/tasks/squad.py    |  7 ++++---
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py
index 3e34f94c..9fc33fcc 100644
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -204,15 +204,7 @@ class Task(abc.ABC):
         self._fewshot_docs = None
         self._instances = None
 
-        self._config = (
-            TaskConfig(
-                {
-                    **config,
-                }
-            )
-            if config
-            else TaskConfig()
-        )
+        self._config = TaskConfig({**config}) if config else TaskConfig()
 
         self._filters = [build_filter_ensemble("none", [["take_first", None]])]
 
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index dab8a7ed..8fd09c2b 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -15,11 +15,11 @@ from lm_eval.api.registry import (
 )
 
 import logging
 
-eval_logger = logging.getLogger("lm-eval")
-
 # import python tasks
 from .squad import SQuAD2
+from .scrolls import QuALITY, NarrativeQA, ContractNLI, GovReport, SummScreenFD, QMSum
 
+eval_logger = logging.getLogger("lm-eval")
 
 def register_configurable_task(config: Dict[str, str]) -> int:
 
diff --git a/lm_eval/tasks/squad.py b/lm_eval/tasks/squad.py
index 5ff66315..84abafc8 100644
--- a/lm_eval/tasks/squad.py
+++ b/lm_eval/tasks/squad.py
@@ -34,6 +34,7 @@ _CITATION = """
 }
 """
 
+
 def _squad_metric(predictions, references):
     squad_metric = datasets.load_metric("squad_v2")
     return squad_metric.compute(predictions=predictions, references=references)
@@ -125,7 +126,7 @@ class SQuAD2(Task):
                 arguments=(ctx, " " + "unanswerable"),
                 idx=0,
                 **kwargs
-            )
+            ),
         ]
 
     def process_results(self, doc, results):
@@ -138,7 +139,7 @@ class SQuAD2(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
-
+
         continuation, (logprob_unanswerable, _) = results
 
         no_answer_probability = exp(logprob_unanswerable)
@@ -234,4 +235,4 @@ class SQuAD2(Task):
             "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
             "best_exact": True,  # Best exact match (with varying threshold)
             "best_f1": True,  # Best F1 (with varying threshold)
-        }
\ No newline at end of file
+        }
--
GitLab


From 6c812bb54749f1627d4c681a5ff99040891ea090 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Thu, 9 Nov 2023 14:24:02 +0000
Subject: [PATCH 186/212] reformat

---
 lm_eval/tasks/__init__.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 8fd09c2b..d373e9c0 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -17,7 +17,14 @@ import logging
 
 # import python tasks
 from .squad import SQuAD2
-from .scrolls import QuALITY, NarrativeQA, ContractNLI, GovReport, SummScreenFD, QMSum
+from .scrolls import (
+    QuALITY,
+    NarrativeQA,
+    ContractNLI,
+    GovReport,
+    SummScreenFD,
+    QMSum
+)
 
 eval_logger = logging.getLogger("lm-eval")
--
GitLab


From 097c92537308bdc14fa86192e8959f5d89b02975 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Thu, 9 Nov 2023 14:24:22 +0000
Subject: [PATCH 187/212] reformat

---
 lm_eval/tasks/__init__.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index d373e9c0..8fd09c2b 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -17,14 +17,7 @@ import logging
 
 # import python tasks
 from .squad import SQuAD2
-from .scrolls import (
-    QuALITY,
-    NarrativeQA,
-    ContractNLI,
-    GovReport,
-    SummScreenFD,
-    QMSum
-)
+from .scrolls import QuALITY, NarrativeQA, ContractNLI, GovReport, SummScreenFD, QMSum
 
 eval_logger = logging.getLogger("lm-eval")
--
GitLab


From 5175375056f3eebb199ed407fff4a5340f6349d3 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Thu, 9 Nov 2023 15:26:23 +0000
Subject: [PATCH 188/212] updates to scrolls

---
 lm_eval/tasks/__init__.py                     |  9 ++-
 lm_eval/tasks/scrolls/README.md               | 31 ++++++++++
 lm_eval/tasks/scrolls/scrolls.yaml            |  9 +++
 lm_eval/tasks/{scrolls.py => scrolls/task.py} | 58 +++++--------------
 4 files changed, 62 insertions(+), 45 deletions(-)
 create mode 100644 lm_eval/tasks/scrolls/README.md
 create mode 100644 lm_eval/tasks/scrolls/scrolls.yaml
 rename lm_eval/tasks/{scrolls.py => scrolls/task.py} (87%)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 8fd09c2b..37c9051e 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -17,7 +17,14 @@ import logging
 
 # import python tasks
 from .squad import SQuAD2
-from .scrolls import QuALITY, NarrativeQA, ContractNLI, GovReport, SummScreenFD, QMSum
+from .scrolls.task import (
+    QuALITY,
+    NarrativeQA,
+    ContractNLI,
+    GovReport,
+    SummScreenFD,
+    QMSum,
+)
 
 eval_logger = logging.getLogger("lm-eval")
 
diff --git a/lm_eval/tasks/scrolls/README.md b/lm_eval/tasks/scrolls/README.md
new file mode 100644
index 00000000..92c84816
--- /dev/null
+++ b/lm_eval/tasks/scrolls/README.md
@@ -0,0 +1,31 @@
+"""
+SCROLLS: Standardized CompaRison Over Long Language Sequences
+https://arxiv.org/abs/2201.03533
+
+SCROLLS is a suite of datasets that require synthesizing information over long texts.
+The benchmark includes seven natural language tasks across multiple domains,
+including summarization, question answering, and natural language inference.
+
+Homepage: https://www.scrolls-benchmark.com/
+
+Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
+it is possible to create "subset" tasks that contain only those samples whose tokenized length
+is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
+be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:
+
+```
+class QasperGPTNeoX4K(Qasper):
+    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
+    PRUNE_MAX_TOKENS = 4096
+    PRUNE_NUM_PROC = _num_cpu_cores()  # optional, to speed up pruning of large datasets like NarrativeQA
+```
+
+`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
+less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
+that use different tokenizers but the same maximum sequence length.
+
+Once the subset task class has been defined in this file, it can be used by adding the class
+to `lm_eval/tasks/__init__.py`.
+
+NOTE: GovReport may need `max_gen_toks` set larger for causal models.
+"""
\ No newline at end of file
diff --git a/lm_eval/tasks/scrolls/scrolls.yaml b/lm_eval/tasks/scrolls/scrolls.yaml
new file mode 100644
index 00000000..6dc31557
--- /dev/null
+++ b/lm_eval/tasks/scrolls/scrolls.yaml
@@ -0,0 +1,9 @@
+group: scrolls
+task:
+  - scrolls_qasper
+  - scrolls_quality
+  - scrolls_narrativeqa
+  - scrolls_contractnli
+  - scrolls_govreport
+  - scrolls_summscreenfd
+  - scrolls_qmsum
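+# (note) with this group registered, all seven subtasks can be run together,
+# e.g. via `lm_eval --tasks scrolls` (exact CLI entry point may vary).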
diff --git a/lm_eval/tasks/scrolls.py b/lm_eval/tasks/scrolls/task.py
similarity index 87%
rename from lm_eval/tasks/scrolls.py
rename to lm_eval/tasks/scrolls/task.py
index 5aece9cc..9b573ed8 100644
--- a/lm_eval/tasks/scrolls.py
+++ b/lm_eval/tasks/scrolls/task.py
@@ -1,34 +1,3 @@
-"""
-SCROLLS: Standardized CompaRison Over Long Language Sequences
-https://arxiv.org/abs/2201.03533
-
-SCROLLS is a suite of datasets that require synthesizing information over long texts.
-The benchmark includes seven natural language tasks across multiple domains,
-including summarization, question answering, and natural language inference.
-
-Homepage: https://www.scrolls-benchmark.com/
-
-Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
-it is possible to create "subset" tasks that contain only those samples whose tokenized length
-is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
-be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:
-
-```
-class QasperGPTNeoX4K(Qasper):
-    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
-    PRUNE_MAX_TOKENS = 4096
-    PRUNE_NUM_PROC = _num_cpu_cores()  # optional, to speed up pruning of large datasets like NarrativeQA
-```
-
-`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
-less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
-that use different tokenizers but the same maximum sequence length.
-
-Once the subset task class has been defined in this file, it can be used by adding the class
-to `lm_eval/tasks/__init__.py`.
-
-NOTE: GovReport may need `max_gen_toks` set larger for causal models.
-"""
 import re
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
@@ -146,13 +115,8 @@ class _SCROLLSTask(Task):
     PRUNE_TOKENIZERS = None
     PRUNE_MAX_TOKENS = None
     PRUNE_NUM_PROC = None
 
-    def __init__(self, no_metric=False):
-        super().__init__()
-        self.metric = (
-            load_metric(_download_metric(), config_name=self.DATASET_NAME)
-            if not no_metric
-            else None
-        )
+    def __post_init__(self):
+        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
 
     def has_training_docs(self):
         return True
@@ -245,8 +209,8 @@ class _SCROLLSTask(Task):
 
 
 class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
-    def __init__(self):
-        super().__init__(no_metric=True)
+    def __post_init__(self):
+        self.metric = None
 
     def _scrolls_metrics(self):
         return None
@@ -270,7 +234,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
             "em": acc_norm * 100.0,
         }
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
 
         request_list = [
             Instance(
@@ -278,6 +242,7 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
                 doc=doc,
                 arguments=(ctx, " {}".format(choice)),
                 idx=i,
+                **kwargs,
             )
             for i, choice in enumerate(doc["choices"])
         ]
         return request_list
@@ -302,12 +267,13 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
             "rougeL": (results[0], doc["outputs"]),
         }
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
         return Instance(
             request_type="generate_until",
             doc=doc,
             arguments=(ctx, {"until": ["\n"]}),
             idx=0,
+            **kwargs,
         )
 
     def doc_to_text(self, doc):
@@ -344,7 +310,7 @@ class Qasper(_SCROLLSTask):
             prediction = results[0]
         return {"f1": (prediction, doc["outputs"])}
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
         if doc["is_yes_no"]:
             return [
                 Instance(
@@ -352,12 +318,14 @@ class Qasper(_SCROLLSTask):
                     doc=doc,
                     arguments=(ctx, " yes"),
                     idx=0,
+                    **kwargs,
                 ),
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
                     arguments=(ctx, " no"),
                     idx=1,
+                    **kwargs,
                 ),
             ]
         else:
@@ -366,6 +334,7 @@ class Qasper(_SCROLLSTask):
                 doc=doc,
                 arguments=(ctx, {"until": ["\n"]}),
                 idx=0,
+                **kwargs,
             )
 
 
@@ -422,12 +391,13 @@ class NarrativeQA(_SCROLLSTask):
     def process_results(self, doc, results):
         return {"f1": (results[0], doc["outputs"])}
 
-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
         return Instance(
             request_type="generate_until",
             doc=doc,
             arguments=(ctx, {"until": ["\n"]}),
             idx=0,
+            **kwargs,
         )
--
GitLab


From 69186ef227542f1926f673d4b7fa4f03637a567b Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Fri, 10 Nov 2023 15:09:28 +0000
Subject: [PATCH 189/212] added initialize_tasks and updated where eval_logger
 is imported from

---
 lm_eval/__main__.py                         | 15 +++++----------
 lm_eval/evaluator.py                        |  3 +--
 lm_eval/models/anthropic_llms.py            |  3 ++-
 lm_eval/models/huggingface.py               |  2 +-
 lm_eval/prompts/__init__.py                 |  2 +-
 lm_eval/tasks/__init__.py                   | 11 +++++++----
 lm_eval/tasks/minerva_math/utils.py         |  2 +-
 lm_eval/tasks/realtoxicityprompts/metric.py |  2 +-
 lm_eval/utils.py                            |  6 ++++++
 9 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py
index 1ece1757..bfcd431e 100644
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -9,12 +9,9 @@ import numpy as np
 from pathlib import Path
 from typing import Union
 
-logging.basicConfig(
-    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO,
-)
-
+from lm_eval import evaluator, utils
+from lm_eval.tasks import initialize_tasks, include_path
+from lm_eval.api.registry import ALL_TASKS
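+# NOTE: task registration now happens explicitly via initialize_tasks()
+# inside cli_evaluate() below, rather than implicitly at import time.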
 
 
 def _handle_non_serializable(o):
     if isinstance(o, np.int64) or isinstance(o, np.int32):
@@ -121,14 +118,12 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         # we allow for args to be passed externally, else we parse them ourselves
         args = parse_eval_args()
 
-    eval_logger = logging.getLogger("lm-eval")
+    eval_logger = utils.eval_logger
     eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
     eval_logger.info(f"Verbosity set to {args.verbosity}")
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-    from lm_eval import evaluator, utils
-    from lm_eval.tasks import include_path
-    from lm_eval.api.registry import ALL_TASKS
+    initialize_tasks(args.verbosity)
 
     if args.limit:
         eval_logger.warning(
diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
index 9e896686..2ee61765 100644
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -20,10 +20,9 @@ from lm_eval.utils import (
     make_table,
     create_iterator,
     get_git_commit_hash,
+    eval_logger
 )
 
-eval_logger = logging.getLogger("lm-eval")
-
 
 @positional_deprecated
 def simple_evaluate(
diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py
index be144b16..fdff2382 100644
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -2,9 +2,10 @@ from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
 from tqdm import tqdm
 import time
-from lm_eval.logger import eval_logger
+from lm_eval import utils
 from typing import List, Any, Tuple
 
+eval_logger = utils.eval_logger
 
 def anthropic_completion(
     client,  #: anthropic.Anthropic,
diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index 8feb8cfa..8e6f5338 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -16,7 +16,6 @@ from pathlib import Path
 import torch.nn.functional as F
 
 from lm_eval import utils
-from lm_eval.logger import eval_logger
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
 
@@ -25,6 +24,7 @@ from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
 from accelerate import Accelerator, find_executable_batch_size, DistributedType
 from typing import List, Optional, Union
 
+eval_logger = utils.eval_logger
 
 def _get_accelerate_args(
     device_map_option: Optional[str] = "auto",
diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py
index 746dc3da..d058a487 100644
--- a/lm_eval/prompts/__init__.py
+++ b/lm_eval/prompts/__init__.py
@@ -3,7 +3,7 @@ import ast
 from typing import Dict
 
 from lm_eval import utils
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger
 
 # Prompt library.
 # Stores prompts in a dictionary indexed by 2 levels:
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 33727058..e8e40269 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -14,8 +14,7 @@ from lm_eval.api.registry import (
 )
 
 import logging
-
-eval_logger = logging.getLogger("lm-eval")
+eval_logger = utils.eval_logger
 
 
 def register_configurable_task(config: Dict[str, str]) -> int:
@@ -168,8 +167,12 @@ def include_path(task_dir):
     return 0
 
 
-task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
-include_path(task_dir)
+def initialize_tasks(verbosity="INFO"):
+
+    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+
+    task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
+    include_path(task_dir)
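+    # (sketch) typical callers do:
+    #   from lm_eval.tasks import initialize_tasks
+    #   initialize_tasks("INFO")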
 
 
 def get_task(task_name, config):
diff --git a/lm_eval/tasks/minerva_math/utils.py b/lm_eval/tasks/minerva_math/utils.py
index 293fa909..bde5801c 100644
--- a/lm_eval/tasks/minerva_math/utils.py
+++ b/lm_eval/tasks/minerva_math/utils.py
@@ -1,7 +1,7 @@
 import datasets
 import re
 import signal
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger
 from typing import Optional, List, Dict
 
 try:
diff --git a/lm_eval/tasks/realtoxicityprompts/metric.py b/lm_eval/tasks/realtoxicityprompts/metric.py
index 42f678ea..072f561d 100644
--- a/lm_eval/tasks/realtoxicityprompts/metric.py
+++ b/lm_eval/tasks/realtoxicityprompts/metric.py
@@ -3,7 +3,7 @@ import json
 import requests
 import numpy as np
 
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger
 
 
 def toxicity_perspective_api(references, predictions, **kwargs):
diff --git a/lm_eval/utils.py b/lm_eval/utils.py
index add1ed66..ccee9521 100644
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -21,8 +21,14 @@ from itertools import islice
 
 import logging
 
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=logging.INFO,
+)
 eval_logger = logging.getLogger("lm-eval")
 
+SPACING = " " * 47
 
 def escaped_split(text, sep_char, maxsplit=-1):
     """Split text into a list on occurrences of the given separation
--
GitLab


From 69186ef227542f1926f673d4b7fa4f03637a567b Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Fri, 10 Nov 2023 15:16:00 +0000
Subject: [PATCH 190/212] fixes

---
 lm_eval/tasks/mmlu/default/_default_template_yaml | 1 -
 .../_mmlu_flan_cot_fewshot_template_yaml | 1 -
 .../_mmlu_flan_cot_zeroshot_template_yaml | 1 -
 .../tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml | 10 +++++-----
 .../_mmlu_flan_loglikelihood_template_yaml | 1 -
 .../loglikelihood/mmlu_abstract_algebra.yaml | 4 ++--
 .../mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml | 4 ++--
 .../mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml | 4 ++--
 .../loglikelihood/mmlu_business_ethics.yaml | 4 ++--
 .../loglikelihood/mmlu_clinical_knowledge.yaml | 4 ++--
 .../loglikelihood/mmlu_college_biology.yaml | 4 ++--
 .../loglikelihood/mmlu_college_chemistry.yaml | 4 ++--
 .../loglikelihood/mmlu_college_computer_science.yaml | 4 ++--
 .../loglikelihood/mmlu_college_mathematics.yaml | 4 ++--
 .../loglikelihood/mmlu_college_medicine.yaml | 4 ++--
 .../loglikelihood/mmlu_college_physics.yaml | 4 ++--
 .../loglikelihood/mmlu_computer_security.yaml | 4 ++--
 .../loglikelihood/mmlu_conceptual_physics.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_econometrics.yaml | 4 ++--
 .../loglikelihood/mmlu_electrical_engineering.yaml | 4 ++--
 .../loglikelihood/mmlu_elementary_mathematics.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_formal_logic.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_global_facts.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_biology.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_chemistry.yaml | 4 ++--
 .../mmlu_high_school_computer_science.yaml | 4 ++--
 .../mmlu_high_school_european_history.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_geography.yaml | 4 ++--
 .../mmlu_high_school_government_and_politics.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_macroeconomics.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_mathematics.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_microeconomics.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_physics.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_psychology.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_statistics.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_us_history.yaml | 4 ++--
 .../loglikelihood/mmlu_high_school_world_history.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_human_aging.yaml | 4 ++--
 .../loglikelihood/mmlu_human_sexuality.yaml | 4 ++--
 .../loglikelihood/mmlu_international_law.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml | 4 ++--
 .../loglikelihood/mmlu_logical_fallacies.yaml | 4 ++--
 .../loglikelihood/mmlu_machine_learning.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_management.yaml | 4 ++--
 .../mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml | 4 ++--
 .../loglikelihood/mmlu_medical_genetics.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml | 4 ++--
 .../loglikelihood/mmlu_moral_scenarios.yaml | 4 ++--
 .../mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_philosophy.yaml | 4 ++--
 .../flan_n_shot/loglikelihood/mmlu_prehistory.yaml | 4 ++--
 .../loglikelihood/mmlu_professional_accounting.yaml | 4 ++--
 .../loglikelihood/mmlu_professional_law.yaml | 4 ++--
 .../loglikelihood/mmlu_professional_medicine.yaml | 4 ++--
 .../loglikelihood/mmlu_professional_psychology.yaml | 4 ++--
 .../loglikelihood/mmlu_public_relations.yaml | 4 ++--
 .../loglikelihood/mmlu_security_studies.yaml | 4 ++--
 .../mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml | 4 ++--
 .../loglikelihood/mmlu_us_foreign_policy.yaml | 4 ++--
 .../mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml | 4 ++--
 .../loglikelihood/mmlu_world_religions.yaml | 4 ++--
 62 files changed, 119 insertions(+), 123 deletions(-)

diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml
index f4a3b079..2ff0c566 100644
--- a/lm_eval/tasks/mmlu/default/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/default/_default_template_yaml
@@ -1,4 +1,3 @@
-group: mmlu
 dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
 test_split: test
 fewshot_split: dev
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
index e340271a..2cdaae2f 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -1,4 +1,3 @@
-group: mmlu_flan_cot_fewshot
 dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
 validation_split: validation
 fewshot_split: dev
diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
index 1e276204..9fcb5a3d 100644
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
@@ -1,4 +1,3 @@
-group: mmlu_flan_cot_zeroshot
 dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
 validation_split: validation
 fewshot_split: dev
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml
index 7705a171..43c5f007 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml
@@ -1,6 +1,6 @@
-group: mmlu_flan_n_shot_generative
+group: mmlu_flan_n_shot_loglikelihood
 task:
-  - mmlu_flan_n_shot_generative_stem
-  - mmlu_flan_n_shot_generative_other
-  - mmlu_flan_n_shot_generative_social_sciences
-  - mmlu_flan_n_shot_generative_humanities
+  - mmlu_flan_n_shot_loglikelihood_stem
+  - mmlu_flan_n_shot_loglikelihood_other
+  - mmlu_flan_n_shot_loglikelihood_social_sciences
+  - mmlu_flan_n_shot_loglikelihood_humanities
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml
index 3efc2e42..8d964bf8 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml
@@ -1,4 +1,3 @@
-group: mmlu_flan_n_shot_loglikelihood
 dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
 test_split: test
 fewshot_split: dev
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml
index 7ac6123b..068898e4 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "abstract_algebra"
 "description": "The following are multiple choice questions (with answers) about abstract\
   \ algebra.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_abstract_algebra"
+"task": "mmlu_flan_n_shot_loglikelihood_abstract_algebra"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml
index 2790a593..db5fa24e 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "anatomy"
 "description": "The following are multiple choice questions (with answers) about anatomy.\n\
   \n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_anatomy"
+"task": "mmlu_flan_n_shot_loglikelihood_anatomy"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml
index 199e9560..5f71dbcf 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "astronomy"
 "description": "The following are multiple choice questions (with answers) about astronomy.\n\
   \n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_astronomy"
+"task": "mmlu_flan_n_shot_loglikelihood_astronomy"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml
index 4a346cd5..54dc204d 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "business_ethics"
 "description": "The following are multiple choice questions (with answers) about business\
   \ ethics.\n\n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_business_ethics"
+"task": "mmlu_flan_n_shot_loglikelihood_business_ethics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml
index 8e27f055..121b3c22 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "clinical_knowledge"
 "description": "The following are multiple choice questions (with answers) about clinical\
   \ knowledge.\n\n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_clinical_knowledge"
+"task": "mmlu_flan_n_shot_loglikelihood_clinical_knowledge"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml
index 91a91c67..cadb6fb7 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "college_biology"
 "description": "The following are multiple choice questions (with answers) about college\
   \ biology.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_college_biology"
+"task": "mmlu_flan_n_shot_loglikelihood_college_biology"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml
index 8d3ddf27..8989ed40 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "college_chemistry"
 "description": "The following are multiple choice questions (with answers) about college\
   \ chemistry.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_college_chemistry"
+"task": "mmlu_flan_n_shot_loglikelihood_college_chemistry"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml
index 1a37e75a..e4bdbdd6 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "college_computer_science"
 "description": "The following are multiple choice questions (with answers) about college\
   \ computer science.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_college_computer_science"
+"task": "mmlu_flan_n_shot_loglikelihood_college_computer_science"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml
index 6ef3d578..17d0cb54 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "college_mathematics"
 "description": "The following are multiple choice questions (with answers) about college\
   \ mathematics.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_college_mathematics"
+"task": "mmlu_flan_n_shot_loglikelihood_college_mathematics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml
index 2bd3c63e..c45a6c9c 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "college_medicine"
 "description": "The following are multiple choice questions (with answers) about college\
   \ medicine.\n\n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_college_medicine"
+"task": "mmlu_flan_n_shot_loglikelihood_college_medicine"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml
index 174a4eee..d325f979 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "college_physics"
 "description": "The following are multiple choice questions (with answers) about college\
   \ physics.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_college_physics"
+"task": "mmlu_flan_n_shot_loglikelihood_college_physics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml
index b5eed81a..5b0a75ff 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "computer_security"
 "description": "The following are multiple choice questions (with answers) about computer\
   \ security.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_computer_security"
+"task": "mmlu_flan_n_shot_loglikelihood_computer_security"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml
index c165c498..94b7eaf6 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "conceptual_physics"
 "description": "The following are multiple choice questions (with answers) about conceptual\
   \ physics.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_conceptual_physics"
+"task": "mmlu_flan_n_shot_loglikelihood_conceptual_physics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml
index 94ca68fe..146d4847 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "econometrics"
 "description": "The following are multiple choice questions (with answers) about econometrics.\n\
   \n"
-"group": "mmlu_flan_n_shot_generative_social_sciences"
+"group": "mmlu_flan_n_shot_loglikelihood_social_sciences"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_econometrics"
+"task": "mmlu_flan_n_shot_loglikelihood_econometrics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml
index 7f72ffca..61cb27e2 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "electrical_engineering"
 "description": "The following are multiple choice questions (with answers) about electrical\
   \ engineering.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_electrical_engineering"
+"task": "mmlu_flan_n_shot_loglikelihood_electrical_engineering"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml
index 091c7a90..39e10f85 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "elementary_mathematics"
 "description": "The following are multiple choice questions (with answers) about elementary\
   \ mathematics.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_elementary_mathematics"
+"task": "mmlu_flan_n_shot_loglikelihood_elementary_mathematics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml
index 64a3d11d..7fb8aa92 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "formal_logic"
 "description": "The following are multiple choice questions (with answers) about formal\
   \ logic.\n\n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": "mmlu_flan_n_shot_loglikelihood_humanities"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_formal_logic"
+"task": "mmlu_flan_n_shot_loglikelihood_formal_logic"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml
index 1ec7cc2c..5ffc9069 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "global_facts"
 "description": "The following are multiple choice questions (with answers) about global\
   \ facts.\n\n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_global_facts"
+"task": "mmlu_flan_n_shot_loglikelihood_global_facts"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml
index 2b2e15a0..328b47f8 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_biology"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school biology.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_biology"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_biology"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml
index 549aea5f..35058375 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_chemistry"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school chemistry.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_chemistry"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_chemistry"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml
index bdbcfe93..cd2e1285 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_computer_science"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school computer science.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_computer_science"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_computer_science"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml
index 855db984..62f9465f 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_european_history"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school european history.\n\n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": "mmlu_flan_n_shot_loglikelihood_humanities"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_european_history"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_european_history"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml
index 6744db9f..c2e8d83f 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_geography"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school geography.\n\n"
-"group": "mmlu_flan_n_shot_generative_social_sciences"
+"group": "mmlu_flan_n_shot_loglikelihood_social_sciences"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_geography"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_geography"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml
index c51d372f..9b72fb19 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_government_and_politics"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school government and politics.\n\n"
-"group": "mmlu_flan_n_shot_generative_social_sciences"
+"group": "mmlu_flan_n_shot_loglikelihood_social_sciences"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_government_and_politics"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml
index d0bf0220..abc4f87a 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_macroeconomics"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school macroeconomics.\n\n"
-"group": "mmlu_flan_n_shot_generative_social_sciences"
+"group": "mmlu_flan_n_shot_loglikelihood_social_sciences"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_macroeconomics"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml
index 958ab60b..1ddd6df3 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_mathematics"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school mathematics.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_mathematics"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_mathematics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml
index 8eaf6059..d72fc2ac 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_microeconomics"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school microeconomics.\n\n"
-"group": "mmlu_flan_n_shot_generative_social_sciences"
+"group": "mmlu_flan_n_shot_loglikelihood_social_sciences"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_microeconomics"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_microeconomics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml
index 208bf5b9..c09b2c1d 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_physics"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school physics.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_physics"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_physics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml
index c11af0a6..8d4bfdd4 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_psychology"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school psychology.\n\n"
-"group": "mmlu_flan_n_shot_generative_social_sciences"
+"group": "mmlu_flan_n_shot_loglikelihood_social_sciences"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_psychology"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_psychology"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml
index a5babfe5..bc0c9d1a 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_statistics"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school statistics.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_statistics"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_statistics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml
index 10306c2e..0524e767 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_us_history"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school us history.\n\n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": "mmlu_flan_n_shot_loglikelihood_humanities"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_us_history"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_us_history"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml
index db7c1c11..86647362 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "high_school_world_history"
 "description": "The following are multiple choice questions (with answers) about high\
   \ school world history.\n\n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": "mmlu_flan_n_shot_loglikelihood_humanities"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_high_school_world_history"
+"task": "mmlu_flan_n_shot_loglikelihood_high_school_world_history"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml
index a3935d43..9d0a0179 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "human_aging"
 "description": "The following are multiple choice questions (with answers) about human\
   \ aging.\n\n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_human_aging"
+"task": "mmlu_flan_n_shot_loglikelihood_human_aging"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml
index 4672103c..96979867 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "human_sexuality"
 "description": "The following are multiple choice questions (with answers) about human\
   \ sexuality.\n\n"
-"group": "mmlu_flan_n_shot_generative_social_sciences"
+"group": "mmlu_flan_n_shot_loglikelihood_social_sciences"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_human_sexuality"
+"task": "mmlu_flan_n_shot_loglikelihood_human_sexuality"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml
index be63a3c5..3d6eb6b2 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "international_law"
 "description": "The following are multiple choice questions (with answers) about international\
   \ law.\n\n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": "mmlu_flan_n_shot_loglikelihood_humanities"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_international_law"
+"task": "mmlu_flan_n_shot_loglikelihood_international_law"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml
index 8e0a8191..0ef1cb18 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "jurisprudence"
 "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\
   \n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": "mmlu_flan_n_shot_loglikelihood_humanities"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_jurisprudence"
+"task": "mmlu_flan_n_shot_loglikelihood_jurisprudence"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml
index 8c920895..b735d073 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "logical_fallacies"
 "description": "The following are multiple choice questions (with answers) about logical\
   \ fallacies.\n\n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": "mmlu_flan_n_shot_loglikelihood_humanities"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_logical_fallacies"
+"task": "mmlu_flan_n_shot_loglikelihood_logical_fallacies"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml
index f9aad4df..fccc7058 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "machine_learning"
 "description": "The following are multiple choice questions (with answers) about machine\
   \ learning.\n\n"
-"group": "mmlu_flan_n_shot_generative_stem"
+"group": "mmlu_flan_n_shot_loglikelihood_stem"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_machine_learning"
+"task": "mmlu_flan_n_shot_loglikelihood_machine_learning"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml
index 4709c403..a40da661 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "management"
 "description": "The following are multiple choice questions (with answers) about management.\n\
   \n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_management"
+"task": "mmlu_flan_n_shot_loglikelihood_management"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml
index 808f1c78..3537a86b 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "marketing"
 "description": "The following are multiple choice questions (with answers) about marketing.\n\
   \n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_marketing"
+"task": "mmlu_flan_n_shot_loglikelihood_marketing"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml
index 3c0a99f8..49247525 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "medical_genetics"
 "description": "The following are multiple choice questions (with answers) about medical\
   \ genetics.\n\n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_medical_genetics"
+"task": "mmlu_flan_n_shot_loglikelihood_medical_genetics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml
index c363f1bd..c6aa9baf 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "miscellaneous"
 "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\
   \n"
-"group": "mmlu_flan_n_shot_generative_other"
+"group": "mmlu_flan_n_shot_loglikelihood_other"
 "include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_generative_miscellaneous"
+"task": "mmlu_flan_n_shot_loglikelihood_miscellaneous"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml
index d710816f..4ff46f42 100644
--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml
@@ -1,6 +1,6 @@
 "dataset_name": "moral_disputes"
 "description": "The following are multiple choice questions (with answers) about moral\
   \ disputes.\n\n"
-"group": "mmlu_flan_n_shot_generative_humanities"
+"group": 
"mmlu_flan_n_shot_loglikelihood_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_moral_disputes" +"task": "mmlu_flan_n_shot_loglikelihood_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml index 7d26770c..cdcc3b01 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml @@ -1,6 +1,6 @@ "dataset_name": "moral_scenarios" "description": "The following are multiple choice questions (with answers) about moral\ \ scenarios.\n\n" -"group": "mmlu_flan_n_shot_generative_humanities" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_moral_scenarios" +"task": "mmlu_flan_n_shot_loglikelihood_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml index 677185b3..5773ca1f 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml @@ -1,6 +1,6 @@ "dataset_name": "nutrition" "description": "The following are multiple choice questions (with answers) about nutrition.\n\ \n" -"group": "mmlu_flan_n_shot_generative_other" +"group": "mmlu_flan_n_shot_loglikelihood_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_nutrition" +"task": "mmlu_flan_n_shot_loglikelihood_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml index 8c4b6f22..944b44a1 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml @@ -1,6 +1,6 @@ "dataset_name": "philosophy" "description": "The following are multiple choice questions (with answers) about philosophy.\n\ \n" -"group": "mmlu_flan_n_shot_generative_humanities" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_philosophy" +"task": "mmlu_flan_n_shot_loglikelihood_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml index 64065a6f..184a9584 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml @@ -1,6 +1,6 @@ "dataset_name": "prehistory" "description": "The following are multiple choice questions (with answers) about prehistory.\n\ \n" -"group": "mmlu_flan_n_shot_generative_humanities" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_prehistory" +"task": "mmlu_flan_n_shot_loglikelihood_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml index 4fb590f8..129b7a50 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml @@ -1,6 +1,6 @@ "dataset_name": 
"professional_accounting" "description": "The following are multiple choice questions (with answers) about professional\ \ accounting.\n\n" -"group": "mmlu_flan_n_shot_generative_other" +"group": "mmlu_flan_n_shot_loglikelihood_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_professional_accounting" +"task": "mmlu_flan_n_shot_loglikelihood_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml index 581b9da7..d28f1935 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml @@ -1,6 +1,6 @@ "dataset_name": "professional_law" "description": "The following are multiple choice questions (with answers) about professional\ \ law.\n\n" -"group": "mmlu_flan_n_shot_generative_humanities" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_professional_law" +"task": "mmlu_flan_n_shot_loglikelihood_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml index c49f9119..68c8dc46 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml @@ -1,6 +1,6 @@ "dataset_name": "professional_medicine" "description": "The following are multiple choice questions (with answers) about professional\ \ medicine.\n\n" -"group": "mmlu_flan_n_shot_generative_other" +"group": "mmlu_flan_n_shot_loglikelihood_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_professional_medicine" +"task": "mmlu_flan_n_shot_loglikelihood_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml index 2d6f441d..72481ae0 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml @@ -1,6 +1,6 @@ "dataset_name": "professional_psychology" "description": "The following are multiple choice questions (with answers) about professional\ \ psychology.\n\n" -"group": "mmlu_flan_n_shot_generative_social_sciences" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_professional_psychology" +"task": "mmlu_flan_n_shot_loglikelihood_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml index 3d330fc9..73280df5 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml @@ -1,6 +1,6 @@ "dataset_name": "public_relations" "description": "The following are multiple choice questions (with answers) about public\ \ relations.\n\n" -"group": "mmlu_flan_n_shot_generative_social_sciences" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": 
"mmlu_flan_n_shot_generative_public_relations" +"task": "mmlu_flan_n_shot_loglikelihood_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml index 8bbe963f..03b27c90 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml @@ -1,6 +1,6 @@ "dataset_name": "security_studies" "description": "The following are multiple choice questions (with answers) about security\ \ studies.\n\n" -"group": "mmlu_flan_n_shot_generative_social_sciences" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_security_studies" +"task": "mmlu_flan_n_shot_loglikelihood_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml index 0cc86bcc..630d1692 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml @@ -1,6 +1,6 @@ "dataset_name": "sociology" "description": "The following are multiple choice questions (with answers) about sociology.\n\ \n" -"group": "mmlu_flan_n_shot_generative_social_sciences" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_sociology" +"task": "mmlu_flan_n_shot_loglikelihood_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml index 12ac4f36..2274d067 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml @@ -1,6 +1,6 @@ "dataset_name": "us_foreign_policy" "description": "The following are multiple choice questions (with answers) about us\ \ foreign policy.\n\n" -"group": "mmlu_flan_n_shot_generative_social_sciences" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_us_foreign_policy" +"task": "mmlu_flan_n_shot_loglikelihood_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml index 6e942396..8bd36105 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml @@ -1,6 +1,6 @@ "dataset_name": "virology" "description": "The following are multiple choice questions (with answers) about virology.\n\ \n" -"group": "mmlu_flan_n_shot_generative_other" +"group": "mmlu_flan_n_shot_loglikelihood_other" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_virology" +"task": "mmlu_flan_n_shot_loglikelihood_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml index 30f97421..e59c2849 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml @@ -1,6 +1,6 @@ "dataset_name": "world_religions" "description": "The following are 
multiple choice questions (with answers) about world\ \ religions.\n\n" -"group": "mmlu_flan_n_shot_generative_humanities" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" "include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_generative_world_religions" +"task": "mmlu_flan_n_shot_loglikelihood_world_religions" -- GitLab From 7760573f7de555c6d8e8fba0a5be272f309d426f Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 14 Nov 2023 13:26:14 +0000 Subject: [PATCH 191/212] enables tasks of different group but same task_alias (for example, if evaluating on different versions of MMLU) --- lm_eval/evaluator.py | 15 ++++----------- lm_eval/utils.py | 4 ++++ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 3fa9633e..9183ce5b 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -562,8 +562,6 @@ def evaluate( task_hierarchy, task_order, versions, task_group_alias ) - _results_agg = collections.defaultdict(dict) - _versions = collections.defaultdict(dict) for task in results_agg: task_results = results_agg[task] @@ -577,13 +575,9 @@ def evaluate( if task in task_group_alias: task_alias = task_group_alias[task] - _results_agg[tab_string + task_alias] = task_results - _versions[tab_string + task_alias] = versions[task] + results_agg[task]["alias"] = tab_string + task_alias else: - _results_agg[tab_string + task] = task_results - _versions[tab_string + task] = versions[task] - results_agg = _results_agg - versions = _versions + results_agg[task]["alias"] = tab_string + task _groups_agg = collections.defaultdict(dict) for group in groups_agg: @@ -599,10 +593,9 @@ def evaluate( if group in task_group_alias: group_alias = task_group_alias[group] - _groups_agg[tab_string + group_alias] = group_results + groups_agg[group]["alias"] = tab_string + group_alias else: - _groups_agg[tab_string + group] = group_results - groups_agg = _groups_agg + groups_agg[group]["alias"] = tab_string + group results_dict = { "results": dict(results_agg.items()), diff --git a/lm_eval/utils.py b/lm_eval/utils.py index d246470a..fc2874fc 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -305,6 +305,10 @@ def make_table(result_dict, column: str = "results"): for k, dic in result_dict[column].items(): version = result_dict["versions"][k] + + if "alias" in dic: + k = dic.pop("alias") + for (mf), v in dic.items(): m, _, f = mf.partition(",") if m.endswith("_stderr"): -- GitLab From d9230aa8868f71ad8f68cdce7463cece3be67ec7 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 14 Nov 2023 13:27:19 +0000 Subject: [PATCH 192/212] removed unused variables --- lm_eval/evaluator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 9183ce5b..8c64e5c4 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -579,7 +579,6 @@ def evaluate( results_agg[task]["alias"] = tab_string + task - _groups_agg = collections.defaultdict(dict) for group in groups_agg: group_results = groups_agg[group] -- GitLab From c9a3fd3f677cf063deca35ca0222d3aa8d725e41 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 15 Nov 2023 12:04:42 +0000 Subject: [PATCH 193/212] update squad file --- lm_eval/tasks/__init__.py | 2 +- lm_eval/tasks/{squad.py => squadv2/task.py} | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) rename lm_eval/tasks/{squad.py => squadv2/task.py} (93%) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 37c9051e..98363e83 100644
--- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -16,7 +16,7 @@ from lm_eval.api.registry import ( import logging # import python tasks -from .squad import SQuAD2 +from .squadv2.task import SQuAD2 from .scrolls.task import ( QuALITY, NarrativeQA, diff --git a/lm_eval/tasks/squad.py b/lm_eval/tasks/squadv2/task.py similarity index 93% rename from lm_eval/tasks/squad.py rename to lm_eval/tasks/squadv2/task.py index 84abafc8..7a38cae1 100644 --- a/lm_eval/tasks/squad.py +++ b/lm_eval/tasks/squadv2/task.py @@ -14,6 +14,7 @@ also determine when no answer is supported by the paragraph and abstain from ans Homepage: https://rajpurkar.github.io/SQuAD-explorer/ """ import datasets +from evaluate import load from math import exp from functools import partial @@ -36,6 +37,7 @@ _CITATION = """ def _squad_metric(predictions, references): + # squad_metric = load("squad_v2") squad_metric = datasets.load_metric("squad_v2") return squad_metric.compute(predictions=predictions, references=references) @@ -46,6 +48,20 @@ def _squad_agg(key, items): return _squad_metric(predictions=predictions, references=references).get(key, 0) +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + def white_space_fix(text): + return ' '.join(text.split()) + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + def lower(text): + return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) + @register_task("squadv2") class SQuAD2(Task): VERSION = 1 -- GitLab From a745d589174ff436aa8d2a5dadd1cd49c4812988 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 16 Nov 2023 15:52:48 +0000 Subject: [PATCH 194/212] remove provide_description flag --- lm_eval/api/task.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 9fc33fcc..860e82b1 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -465,18 +465,8 @@ class Task(abc.ABC): assert ( rnd is not None ), "A `random.Random` generator argument must be provided to `rnd`" - assert not provide_description, ( - "The `provide_description` arg will be removed in future versions. To prepend " - "a custom description to the context, supply the corresponding string via the " - "`description` arg." 
- ) - if provide_description is not None: - # nudge people to not specify it at all - print( - "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" - ) - description = description + "\n\n" if description else "" + description = description if description else "" if num_fewshot == 0: labeled_examples = "" -- GitLab From 1eed1cb5c73cbf8d2effd89be9b564e56bae7b8d Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 16 Nov 2023 16:43:31 +0000 Subject: [PATCH 195/212] update gsm8k name --- tests/models/test_huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_huggingface.py b/tests/models/test_huggingface.py index 1fd9464a..3bb0172a 100644 --- a/tests/models/test_huggingface.py +++ b/tests/models/test_huggingface.py @@ -15,7 +15,7 @@ class Test_HFLM: multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1) MULTIPLE_CH: list[Instance] = multiple_choice_task.instances - generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")() # type: ignore + generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")() # type: ignore generate_until_task.build_all_requests(limit=10, rank=0, world_size=1) generate_until_task._config.generation_kwargs["max_gen_toks"] = 10 generate_until: list[Instance] = generate_until_task.instances -- GitLab From 57ce579c2da7fa672146d18c5b67f2926c7f6f17 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 16 Nov 2023 19:20:04 +0000 Subject: [PATCH 196/212] widen HF model test tolerance --- tests/models/test_huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_huggingface.py b/tests/models/test_huggingface.py index 3bb0172a..914288f0 100644 --- a/tests/models/test_huggingface.py +++ b/tests/models/test_huggingface.py @@ -115,7 +115,7 @@ class Test_HFLM: def test_logliklihood_rolling(self) -> None: res = self.LM.loglikelihood_rolling(self.ROLLING) - assert np.allclose(res, self.ROLLING_RES, atol=1e-2) + assert np.allclose(res, self.ROLLING_RES, atol=1e-1) def test_toc_encode(self) -> None: res = self.LM.tok_encode("foo bar") -- GitLab From 10cc0a56886324e72310f50ee0114c021453c792 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 17 Nov 2023 06:44:14 +0000 Subject: [PATCH 197/212] edits and format --- lm_eval/__init__.py | 3 +-- lm_eval/__main__.py | 3 ++- lm_eval/evaluator.py | 3 +-- lm_eval/models/anthropic_llms.py | 1 + lm_eval/models/huggingface.py | 1 + lm_eval/tasks/__init__.py | 1 + lm_eval/utils.py | 1 + 7 files changed, 8 insertions(+), 5 deletions(-) diff --git a/lm_eval/__init__.py b/lm_eval/__init__.py index 323f916f..317c0291 100644 --- a/lm_eval/__init__.py +++ b/lm_eval/__init__.py @@ -1,2 +1 @@ -# from .evaluator import evaluate, simple_evaluate -# from .logger import eval_logger, SPACING +from .evaluator import evaluate, simple_evaluate diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index bfcd431e..c301f9f7 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -13,6 +13,7 @@ from lm_eval import evaluator, utils from lm_eval.tasks import initialize_tasks, include_path from lm_eval.api.registry import ALL_TASKS + def _handle_non_serializable(o): if isinstance(o, np.int64) or isinstance(o, np.int32): return int(o) @@ -163,7 +164,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: missing = ", ".join(task_missing) eval_logger.error( f"Tasks were not found: {missing}\n" - 
f"{' ' * 47}Try `lm-eval --tasks list` for list of available tasks", + f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks", ) raise ValueError( f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks." diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 07853cc8..9ac2c760 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -20,7 +20,7 @@ from lm_eval.utils import ( make_table, create_iterator, get_git_commit_hash, - eval_logger + eval_logger, ) @@ -40,7 +40,6 @@ def simple_evaluate( decontamination_ngrams_path=None, write_out: bool = False, log_samples: bool = True, - verbosity: str = "INFO", ): """Instantiate and evaluate a model on a list of tasks. diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py index fdff2382..18b1b70a 100644 --- a/lm_eval/models/anthropic_llms.py +++ b/lm_eval/models/anthropic_llms.py @@ -7,6 +7,7 @@ from typing import List, Any, Tuple eval_logger = utils.eval_logger + def anthropic_completion( client, #: anthropic.Anthropic, model: str, diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 8e6f5338..2ae707bb 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -26,6 +26,7 @@ from typing import List, Optional, Union eval_logger = utils.eval_logger + def _get_accelerate_args( device_map_option: Optional[str] = "auto", max_memory_per_gpu: Optional[Union[int, str]] = None, diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index e8e40269..7b806ca3 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -14,6 +14,7 @@ from lm_eval.api.registry import ( ) import logging + eval_logger = utils.eval_logger diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 65ce6162..88d9b9dc 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -30,6 +30,7 @@ eval_logger = logging.getLogger("lm-eval") SPACING = " " * 47 + def escaped_split(text, sep_char, maxsplit=-1): """Split text into a list on occurrences of the given separation character `sep_char`. The separation character may be escaped by a -- GitLab From 0d209e2e8be992fe9aac8fd0555e21687dde9dbe Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 17 Nov 2023 07:00:10 +0000 Subject: [PATCH 198/212] removed provide_description arg --- lm_eval/api/task.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 860e82b1..94d0b041 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -441,7 +441,6 @@ class Task(abc.ABC): self, doc, num_fewshot, - provide_description=None, rnd=random.Random(1234), description=None, ): @@ -452,8 +451,6 @@ class Task(abc.ABC): The document as returned from training_docs, validation_docs, or test_docs. :param num_fewshot: int The number of fewshot examples to provide in the returned context string. - :param provide_description: bool - Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method :param rnd: random.Random The pseudo-random number generator used to randomly sample examples. WARNING: This is currently a required arg although it's optionalized with a default `None`. 
-- GitLab From f40b7d0efcc192ab54c78684b00ddf9e4ef37254 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 17 Nov 2023 15:05:17 +0000 Subject: [PATCH 199/212] precommit --- lm_eval/tasks/scrolls/README.md | 2 +- lm_eval/tasks/squadv2/README.md | 54 +++++++++++++++++++++++++++++++++ lm_eval/tasks/squadv2/task.py | 14 --------- 3 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 lm_eval/tasks/squadv2/README.md diff --git a/lm_eval/tasks/scrolls/README.md b/lm_eval/tasks/scrolls/README.md index 92c84816..a90e00f4 100644 --- a/lm_eval/tasks/scrolls/README.md +++ b/lm_eval/tasks/scrolls/README.md @@ -28,4 +28,4 @@ Once the subset task class has been defined in this file, it can be used by addi to `lm_eval/tasks/__init__.py`. NOTE: GovReport may need `max_gen_toks` set larger for causal models. -""" \ No newline at end of file +""" diff --git a/lm_eval/tasks/squadv2/README.md b/lm_eval/tasks/squadv2/README.md new file mode 100644 index 00000000..bad0c4e2 --- /dev/null +++ b/lm_eval/tasks/squadv2/README.md @@ -0,0 +1,54 @@ +# Task-name + +### Paper + +Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD` +Abstract: https://arxiv.org/abs/1806.03822 + +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, +consisting of questions posed by crowdworkers on a set of Wikipedia articles, +where the answer to every question is a segment of text, or span, from the +corresponding reading passage, or the question might be unanswerable. +SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable +questions written adversarially by crowdworkers to look similar to answerable ones. +To do well on SQuAD2.0, systems must not only answer questions when possible, but +also determine when no answer is supported by the paragraph and abstain from answering. + +Homepage: https://rajpurkar.github.io/SQuAD-explorer/ + + +### Citation + +``` +@misc{rajpurkar2018know, + title={Know What You Don't Know: Unanswerable Questions for SQuAD}, + author={Pranav Rajpurkar and Robin Jia and Percy Liang}, + year={2018}, + eprint={1806.03822}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet + +#### Tasks + +* `squadv2`: `Default squadv2 task` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/lm_eval/tasks/squadv2/task.py b/lm_eval/tasks/squadv2/task.py index 7a38cae1..74437a98 100644 --- a/lm_eval/tasks/squadv2/task.py +++ b/lm_eval/tasks/squadv2/task.py @@ -48,20 +48,6 @@ def _squad_agg(key, items): return _squad_metric(predictions=predictions, references=references).get(key, 0) -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) - def white_space_fix(text): - return ' '.join(text.split()) - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - def lower(text): - return text.lower() - return white_space_fix(remove_articles(remove_punc(lower(s)))) - @register_task("squadv2") class SQuAD2(Task): VERSION = 1 -- GitLab From 2314ff4fdf6b9c6213154809f71b05881b70b0fe Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 17 Nov 2023 15:22:41 +0000 Subject: [PATCH 200/212] will check if group_name is None --- lm_eval/evaluator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 611f17e5..7bdb7cff 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -226,6 +226,7 @@ def evaluate( versions[group_name] = "N/A" else: + group_name = None task_hierarchy[task_name] = [] if task is None: @@ -239,7 +240,7 @@ def evaluate( if ("group_alias" in configs[task_name]) and ( group_name not in task_group_alias - ): + ) and (group_name != None): task_group_alias[group_name] = configs[task_name]["group_alias"] if limit is not None: -- GitLab From bf6cf1debe43c60012b448db006a58702b3d899e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 17 Nov 2023 15:26:04 +0000 Subject: [PATCH 201/212] format --- lm_eval/evaluator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 7bdb7cff..aae4ebbd 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -238,9 +238,11 @@ def evaluate( if "task_alias" in configs[task_name]: task_group_alias[task_name] = configs[task_name]["task_alias"] - if ("group_alias" in configs[task_name]) and ( - group_name not in task_group_alias - ) and (group_name != None): + if ( + ("group_alias" in configs[task_name]) + and (group_name not in task_group_alias) + and (group_name is not None) + ): task_group_alias[group_name] = configs[task_name]["group_alias"] if limit is not None: -- GitLab From 66eed4bbaf950a1bcfa20a5d25a54639225f0e80 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Fri, 17 Nov 2023 15:46:30 +0000 Subject: [PATCH 202/212] update script to use initialize_tasks --- scripts/write_out.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/write_out.py b/scripts/write_out.py index cc15ad33..59fdba7c 100644 --- a/scripts/write_out.py +++ b/scripts/write_out.py @@ -4,9 +4,8 @@ import json import os import random from lm_eval import tasks -from lm_eval.utils import join_iters -from lm_eval.tasks import include_path -from lm_eval.logger import eval_logger +from lm_eval.utils import join_iters, eval_logger +from lm_eval.tasks import initialize_tasks, include_path EXAMPLE_DIVIDER = "!!@@##@@!! 
-- Example {i}\n" @@ -25,6 +24,12 @@ def parse_args(): default=None, help="Additional path to include if there are external tasks to include.", ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Log error when tasks are not registered.", + ) return parser.parse_args() @@ -32,6 +37,8 @@ def main(): args = parse_args() np.random.seed(args.seed) + initialize_tasks(args.verbosity) + if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") include_path(args.include_path) -- GitLab From aa9e106235305ff54c43dd607dd04dc6174c371d Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 17 Nov 2023 16:28:33 +0000 Subject: [PATCH 203/212] fix error when using wildcard task names --- lm_eval/__main__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index aaf98419..b9e2588d 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -149,7 +149,11 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: if os.path.isfile(task): config = utils.load_yaml_config(task) task_names.append(config) - task_missing = [task for task in tasks_list if task not in task_names] + task_missing = [ + task + for task in tasks_list + if task not in task_names and "*" not in task + ] if task_missing: missing = ", ".join(task_missing) -- GitLab From 2444b36436b13d3b82a818e7318253868acd93a5 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 17 Nov 2023 16:29:09 +0000 Subject: [PATCH 204/212] fix error when using wildcard task names --- lm_eval/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index b9e2588d..380f03fa 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -153,7 +153,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: task for task in tasks_list if task not in task_names and "*" not in task - ] + ] # we don't want errors if a wildcard ("*") task name was used if task_missing: missing = ", ".join(task_missing) -- GitLab From 487767ab0d0a5fc8d817dd114a5a385f70f50535 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 17 Nov 2023 16:30:25 +0000 Subject: [PATCH 205/212] update bigbench datasets to mirror, rename groupnames --- lm_eval/tasks/bigbench/generate_until_template_yaml | 4 ++-- lm_eval/tasks/bigbench/multiple_choice_template_yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lm_eval/tasks/bigbench/generate_until_template_yaml b/lm_eval/tasks/bigbench/generate_until_template_yaml index ebce0377..198e1c47 100644 --- a/lm_eval/tasks/bigbench/generate_until_template_yaml +++ b/lm_eval/tasks/bigbench/generate_until_template_yaml @@ -1,5 +1,5 @@ -group: bigbench -dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed +group: bigbench_generate_until +dataset_path: hails/bigbench output_type: generate_until dataset_kwargs: # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods diff --git a/lm_eval/tasks/bigbench/multiple_choice_template_yaml b/lm_eval/tasks/bigbench/multiple_choice_template_yaml index 3de7b5b7..b51310e8 100644 --- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml +++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml @@ -1,5 +1,5 @@ -group: bigbench -dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed +group: bigbench_multiple_choice +dataset_path: hails/bigbench 
dataset_kwargs: # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods # subtask_name: null -- GitLab From 032820ddba95956f4b30ca98babe7bdc9bede523 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 17 Nov 2023 19:30:56 +0000 Subject: [PATCH 206/212] initialize tasks in test files --- tests/models/test_huggingface.py | 2 ++ tests/test_evaluator.py | 1 + tests/test_tasks.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/models/test_huggingface.py b/tests/models/test_huggingface.py index 914288f0..557ad051 100644 --- a/tests/models/test_huggingface.py +++ b/tests/models/test_huggingface.py @@ -8,6 +8,8 @@ import lm_eval.tasks as tasks import sys import torch +tasks.initialize_tasks() + class Test_HFLM: torch.use_deterministic_algorithms(True) diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index ccd2f850..7f30e21f 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -11,6 +11,7 @@ from typing import List import random import pytest +tasks.initialize_tasks() # TODO: more fine grained unit tests rather than this big honking integration # test once we break evaluator into smaller, more manageable pieces diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 70875469..41504430 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -4,7 +4,7 @@ from .utils import new_tasks import lm_eval.tasks as tasks from lm_eval.api.task import ConfigurableTask - +tasks.initialize_tasks() # Default Task TASKS = ["arc_easy"] -- GitLab From 9b596e8f9155064cdab7630633c92711372a39bb Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 17 Nov 2023 19:32:50 +0000 Subject: [PATCH 207/212] add initialize_tasks() to interface guide in docs --- docs/interface.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/interface.md b/docs/interface.md index 36353e7f..432b0f61 100644 --- a/docs/interface.md +++ b/docs/interface.md @@ -59,6 +59,8 @@ my_model = initialize_my_model() # create your model (could be running finetunin ... lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()` +lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory. + results = lm_eval.simple_evaluate( # call simple_evaluate model=lm_obj, tasks=["taskname1", "taskname2"], @@ -85,7 +87,7 @@ my_model = initialize_my_model() # create your model (could be running finetunin ... lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()` - +lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory. 
def evaluate( lm=lm_obj, -- GitLab From f87b864f27c982cab50a269f1f6db15fb8d4c8a8 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Mon, 20 Nov 2023 20:28:17 +0000 Subject: [PATCH 208/212] add ggml tests --- tests/models/test_gguf.py | 151 ++++++++++++++++++ ...476269826cd3b54262f7a0981f75ddd45b25d0.pkl | Bin 0 -> 153 bytes ...824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl | Bin 0 -> 153 bytes ...cc3ce31a691b665d295f6c4e4bbd71c7caa1a2.pkl | Bin 0 -> 532 bytes 4 files changed, 151 insertions(+) create mode 100644 tests/models/test_gguf.py create mode 100644 tests/testdata/gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl create mode 100644 tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl create mode 100644 tests/testdata/gguf_test_8fcf3f2f52afeb2acd7c8e02c2cc3ce31a691b665d295f6c4e4bbd71c7caa1a2.pkl diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py new file mode 100644 index 00000000..6d186676 --- /dev/null +++ b/tests/models/test_gguf.py @@ -0,0 +1,151 @@ +import unittest +from unittest.mock import patch +import hashlib +import json +import os +import pickle +from lm_eval.models.gguf import GGUFLM + +from lm_eval.api.instance import Instance + +base_url = "https://matthoffner-ggml-llm-api.hf.space" + + +def gguf_completion_mock(base_url=None, **kwargs): + # Generate a hash from the parameters + hash_kwargs = {"base_url": base_url, **kwargs} + hash = hashlib.sha256( + json.dumps(hash_kwargs, sort_keys=True).encode("utf-8") + ).hexdigest() + + fname = f"./tests/testdata/gguf_test_{hash}.pkl" + + if os.path.exists(fname): + with open(fname, "rb") as fh: + return pickle.load(fh) + else: + print("The file does not exist, attempting to write...") + if "stop" in kwargs: + result = { + "choices": [ + { + "text": f"generated text until {kwargs['stop']}", + "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0}, + "finish_reason": "length", + } + ] + } + else: + # generated with # curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}' + result = { + "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3", + "object": "text_completion", + "created": 1700511361, + "model": "../llama-2-7b.Q8_0.gguf", + "choices": [ + { + "text": "string(", + "index": 0, + "logprobs": { + "text_offset": [0, 7], + "token_logprobs": [None, -1.033263319857306], + "tokens": [" string", "("], + "top_logprobs": [ + None, + { + "(": -1.033263319857306, + "[]": -2.6530743779017394, + ".": -3.0377145947291324, + "\n": -3.0399156750513976, + "_": -3.510376089937872, + " =": -3.6957918347193663, + ",": -3.9309459866358702, + " of": -4.2834550083949035, + '("': -4.322762841112799, + "()": -4.426229113466925, + }, + ], + }, + "finish_reason": "length", + } + ], + "usage": { + "prompt_tokens": 2, + "completion_tokens": 1, + "total_tokens": 3, + }, + } + + try: + os.makedirs(os.path.dirname(fname), exist_ok=True) + print("Writing file at", fname) + with open(fname, "wb") as fh: + pickle.dump(result, fh) + print("File written successfully") + except Exception as e: + print("File writing failed:", e) + + return result + + +class GGUFLMTest(unittest.TestCase): + @patch( + "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock + ) + def test_loglikelihood(self, gguf_completion_mock): + lm = GGUFLM(base_url) + + # Test loglikelihood + requests = [ + Instance( + 
request_type="loglikelihood", + doc=args, + arguments=args, + idx=i, + ) + for i, args in enumerate([("str", "ing"), ("str", "ing")]) + ] + res = lm.loglikelihood(requests) + + # Assert the loglikelihood response is correct + expected_res = [(logprob, True) for logprob in [0, 0]] + self.assertEqual(res, expected_res) + + @patch( + "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock + ) + def test_generate_until(self, gguf_completion_mock): + lm = GGUFLM(base_url) + + # Test generate_until + requests = [ + Instance( + request_type="generate_until", + doc={"input": doc}, + arguments=(doc, {"until": stop}), + idx=i, + ) + for i, (doc, stop) in enumerate([("input1", "stop1"), ("input2", "stop2")]) + ] + + res = lm.generate_until(requests) + + # Assert the generate_until response is correct + expected_res = ["generated text until stop1", "generated text until stop2"] + self.assertEqual(res, expected_res) + + # @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock) + # def test_loglikelihood_rolling(self, gguf_completion_mock): + # lm = GGUFLM(base_url) + + # # Test loglikelihood_rolling + # requests = ["input1", "input2"] + # res = lm.loglikelihood_rolling(requests) + + # # Assert the loglikelihood_rolling response is correct + # expected_res = [(-1.2345, True), (-1.2345, True)] + # self.assertEqual(res, expected_res) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/testdata/gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl b/tests/testdata/gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55770de543fcf5b2106b9f8d92a4922b53d6882b GIT binary patch literal 153 zcmZo*ncBwy0kufv+{HwR+2FTAF1y ziPorFRgJ8Zme#DboJMV}X&X+RW0qRyU^9*d2A9mVsjvd`XuvUN4}mb~^1a))Q@~80 zcERUZ)b&;0Cw-#TG{e$68)j8^-N=!K9`$Sp`5BkHq~(pG8nU4mxGKkt7j)s0w-Y$! 
zQ+F_=Rv0(V#-pj@gusMXZ713}zB8V{Wc!3x8Op$IVbI$q@ pxU9z`f|)2JF2teaMLzujGk>4iR&e3(b0*cKd@LjWq>zdt{Rg?E#P9$B literal 0 HcmV?d00001 -- GitLab From f7873a494520f0bd2251917ec6cdfebd524b65d6 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 21 Nov 2023 16:28:52 +0000 Subject: [PATCH 209/212] update multi-token stopsequence handling --- lm_eval/models/huggingface.py | 4 +--- lm_eval/utils.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 2ae707bb..7c071e7f 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -889,8 +889,6 @@ class HFLM(LM): max_gen_toks = kwargs.pop("max_gen_toks") else: max_gen_toks = self.max_gen_toks - # first stop sequence is used to halt generation upon encountering - primary_until = [until[0]] # set the max length in tokens of inputs ("context_enc") if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: @@ -916,7 +914,7 @@ class HFLM(LM): cont = self._model_generate( context=context_enc, attention_mask=attn_masks, - stop=primary_until, + stop=until, **kwargs, ) diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 88d9b9dc..ee2b281c 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -579,7 +579,14 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria): self.done_tracker = [False] * batch_size self.sequence = sequence self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False) - self.sequence_id_len = len(self.sequence_ids) + # we look back for 2 more tokens than it takes to encode our stop sequence + # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']` + # and we don't want to mistakenly not stop a generation because our + # (string) stop sequence was output in a different tokenization + + # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model, + # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized + self.sequence_id_len = len(self.sequence_ids) + 2 self.tokenizer = tokenizer def __call__(self, input_ids, scores, **kwargs) -> bool: @@ -589,7 +596,6 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria): ] lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) - for i, done in enumerate(self.done_tracker): if not done: self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] -- GitLab From a1403c8f8c1bc7bfc8a854069413a37940de9c14 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Tue, 21 Nov 2023 17:19:08 -0500 Subject: [PATCH 210/212] Update model_guide.md --- docs/model_guide.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/model_guide.md b/docs/model_guide.md index 10c58e06..c06c2770 100644 --- a/docs/model_guide.md +++ b/docs/model_guide.md @@ -1,7 +1,5 @@ # New Model Guide -The `lm-evaluation-harness` is intended to be a model-agnostic framework for evaluating . We provide first-class support for HuggingFace `AutoModelForCausalLM` and `AutoModelForSeq2SeqLM` type models, but - This guide may be of special interest to users who are using the library outside of the repository, via installing the library via pypi and calling `lm_eval.evaluator.evaluate()` to evaluate an existing model. In order to properly evaluate a given LM, we require implementation of a wrapper class subclassing the `lm_eval.api.model.LM` class, that defines how the Evaluation Harness should interface with your model. 
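For orientation, a bare-bones skeleton of such a subclass might look like the sketch below. The class name, type hints, and placeholder return values are illustrative assumptions rather than part of the guide; the three methods and their request/response shapes follow the tests earlier in this series.

```python
from typing import List, Tuple

from lm_eval.api.instance import Instance
from lm_eval.api.model import LM


class MyCustomLM(LM):
    """Illustrative skeleton only -- the bodies below are placeholders."""

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # Each request's `arguments` is a (context, continuation) pair; return
        # (log-probability of the continuation, whether it was the greedy choice).
        return [(0.0, True) for _ in requests]  # placeholder values

    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        # Each request's `arguments` holds one full string to score.
        return [0.0 for _ in requests]  # placeholder values

    def generate_until(self, requests: List[Instance]) -> List[str]:
        # Each request's `arguments` is (context, gen_kwargs); gen_kwargs may
        # carry stop strings under "until", as in the GGUF tests above.
        return ["" for _ in requests]  # placeholder generations
```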
This guide walks through how to write this `LM` subclass via adding it to the library! -- GitLab From d3e46bb3f27842a21c0fbdeb1b74cc2968d3846f Mon Sep 17 00:00:00 2001 From: baberabb <92168766+baberabb@users.noreply.github.com> Date: Thu, 23 Nov 2023 19:25:16 +0500 Subject: [PATCH 211/212] Update README with additional usage instructions. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 8fc81fb6..2a5ac81c 100644 --- a/README.md +++ b/README.md @@ -216,6 +216,11 @@ python -m lm_eval \ We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`. +To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis. + +Additionally provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring. + +For a full list supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md) guide! ## How to Contribute or Learn More? -- GitLab From 4fd6a731338fb233357cc54d77d1aba95f42ec52 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Thu, 23 Nov 2023 21:12:41 -0500 Subject: [PATCH 212/212] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2a5ac81c..65f4e28a 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ python -m lm_eval \ --check_integrity ``` -## Advanced Usage +## Advanced Usage Tips For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument: ```bash @@ -218,9 +218,9 @@ We support wildcards in task names, for example you can run all of the machine-t To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis. -Additionally provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring. +Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring. -For a full list supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md) guide! +For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md) guide in our documentation! ## How to Contribute or Learn More? -- GitLab
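The lookback padding added in f7873a4 above guards against exactly the tokenization mismatch its comment describes: a stop string such as `"\n\n"` may encode to a single token yet be emitted by the model as two separate `"\n"` tokens, so decoding only `len(sequence_ids)` trailing tokens can miss it. The same commit also passes every stop sequence in `until` to generation rather than only the first. A self-contained toy illustration of why the padded window matters — the tokenizer below is a stand-in written for this sketch, not the HuggingFace API:

```python
class ToyTokenizer:
    """Stand-in tokenizer where "\\n\\n" is a single token id."""

    vocab = {0: "hello", 1: " world", 2: "\n", 3: "\n\n"}

    def encode(self, text):
        # Only the two cases needed for the demo.
        return [3] if text == "\n\n" else [2]

    def decode(self, ids):
        return "".join(self.vocab[i] for i in ids)


stop = "\n\n"
tok = ToyTokenizer()
generated = [0, 1, 2, 2]  # "hello world\n\n", emitted as two single newlines

exact_len = len(tok.encode(stop))  # 1 token
padded_len = exact_len + 2         # the lookback window used after f7873a4

print(stop in tok.decode(generated[-exact_len:]))   # False: misses the split stop
print(stop in tok.decode(generated[-padded_len:]))  # True: padded window catches it
```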