diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 4f105d094bce82c638b29c8e0a2a3023007291a8..59d5e6d141b25ddcdcbd7f943e76ad2bc1655e5b 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -43,7 +43,7 @@ jobs: # # mypy turned off for now # - name: Lint with mypy # run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable -Job 2 +# Job 2 testcpu: name: CPU Tests runs-on: ubuntu-latest diff --git a/CODEOWNERS b/CODEOWNERS index 9e6375466f1419af1e4c3450c5fbf7934f89b82f..35ca63fe12e3c52a379bae5af259d30640f83d9b 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @haileyschoelkopf @lintangsutawika +* @haileyschoelkopf @lintangsutawika @StellaAthena diff --git a/README.md b/README.md index 62dc36e07de46526b81118f45d053850a691a9d5..92af820c3bd80e5c008895343690240ee7b10c3b 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,24 @@ # Language Model Evaluation Harness -## Notice to Users -(as of 6/15/23) -We have a revamp of the Evaluation Harness library internals staged on the [big-refactor](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) branch! It is far along in progress, but before we start to move the `master` branch of the repository over to this new design with a new version release, we'd like to ensure that it's been tested by outside users and there are no glaring bugs. - -We’d like your help to test it out! you can help by: -1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive, -2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information. - -If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with: -- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks ...` which will run the task in the `master` branch, and what the score is -- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations. - -Lastly, we'll no longer be accepting new feature requests beyond those that are already open to the master branch as we carry out this switch to the new version over the next week, though we will be accepting bugfixes to `master` branch and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EAI discord for more information. - ## Overview This project provides a unified framework to test generative language models on a large number of different evaluation tasks. Features: -- Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md). +- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented. 
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
-- Support for evaluation on adapters (e.g. LoRa) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
-- Evaluating with publicly available prompts ensures reproducibility and comparability between papers.
+- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
+- Support for local models and benchmarks.
+- Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
+
+The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and is used internally by dozens of companies including NVIDIA, Cohere, Booz Allen Hamilton, and MosaicML.
+
## Install

-To install the `lm-eval` refactor branch from the github repository, run:
+To install the `lm-eval` package from the github repository, run:

```bash
git clone https://github.com/EleutherAI/lm-evaluation-harness
@@ -54,7 +44,6 @@ To install the package with all extras, run
pip install -e ".[all]"
```

-
## Support

The best way to get support is to open an issue on this repo or join the [EleutherAI discord server](discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases.
@@ -86,7 +75,7 @@ python -m lm_eval \
    --batch_size 8
```

-Models that are loaded via either `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) or `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported via Support for this model type is currently pending.
+Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported.

Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to the above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be:
@@ -151,14 +140,14 @@ A full accounting of the supported and planned libraries + APIs can be seen belo
| API or Inference Server | Implemented? | `--model ` name | Models supported: | Request Types: |
|-----------------------------|---------------------------------|----------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------|
-| OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| OpenAI ChatCompletions | :x: Not yet - needs help! | N/A | (link here?) | `greedy_until` (no logprobs) |
-| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `greedy_until` (no logprobs) |
-| GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| Textsynth | Needs testing | `textsynth` | ??? | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| vLLM | :x: Not yet - needs help! | N/A | All HF models | `greedy_until` (no logprobs) |
+| OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| OpenAI ChatCompletions | :x: Not yet - needs testing! | N/A | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
+| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
+| GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| Textsynth | Needs testing | `textsynth` | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| vLLM | :x: Not yet - needs help! | N/A | All HF models | `generate_until` (no logprobs) |
| Your inference server here! | ... | ... | ... | ... |
| ... |

It is on our roadmap to create task variants designed to enable models which do not serve logprobs/loglikelihoods to be compared against the generation performance of open-source models.
@@ -227,12 +216,6 @@ python -m lm_eval \

We support wildcards in task names; for example, you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.

-## Implementing new tasks
-
-To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).
-
-
-As a start, we currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. 
If you would like to study how varying prompts causes changes in the evaluation score, we support prompts authored in the [Promptsource Library](https://github.com/bigscience-workshop/promptsource/tree/main) as described further in https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md and https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/advanced_task_guide.md and welcome contributions of novel task templates and task variants. ## How to Contribute or Learn More? @@ -241,35 +224,19 @@ For more information on the library and how everything fits together, check out You can also ask for help, or discuss new features with the maintainers in the #lm-thunderdome channel of the EleutherAI discord! If you've used the library and have had a positive (or negative) experience, we'd love to hear from you! +### Implementing new tasks + +To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md). + + +As a start, we currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, we support prompts authored in the [Promptsource Library](https://github.com/bigscience-workshop/promptsource/tree/main) as described further in [the task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md) and [the advanced task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/advanced_task_guide.md) and welcome contributions of novel task templates and task variants. + ## Cite as ``` -@software{eval-harness, - author = {Gao, Leo and - Tow, Jonathan and - Abbasi, Baber and - Biderman, Stella and - Black, Sid and - DiPofi, Anthony and - Foster, Charles and - Golding, Laurence and - Hsu, Jeffrey and - Le Noac'h, Alain and - Li, Haonan and - McDonell, Kyle and - Muennighoff, Niklas and - Ociepa, Chris - Phang, Jason and - Reynolds, Laria and - Schoelkopf, Hailey and - Skowron, Aviya and - Sutawika, Lintang and - Tang, Eric and - Thite, Anish and - Wang, Ben and - Wang, Kevin and - Zou, Andy}, +@misc{eval-harness, + author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, title = {A framework for few-shot language model evaluation}, month = sep, year = 2021, diff --git a/docs/interface.md b/docs/interface.md index 860dd1c01ee4252d2bd7d3a494649d54f1632211..36353e7fa1716c6e42dc41328f4e478995c88ccf 100644 --- a/docs/interface.md +++ b/docs/interface.md @@ -57,7 +57,7 @@ import lm_eval my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code) ... 
-lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
+lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`

results = lm_eval.simple_evaluate( # call simple_evaluate
    model=lm_obj,
@@ -83,7 +83,7 @@ from my_tasks import MyTask1 # suppose you've defined a custom lm_eval.api.Task

my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
-lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
+lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`

diff --git a/docs/model_guide.md b/docs/model_guide.md
index cf79dd77b8da0cd49cadf4264615f5727338bf4e..10c58e06540c7875b63bb35b85d93dc0b1e7f1c1 100644
--- a/docs/model_guide.md
+++ b/docs/model_guide.md
@@ -44,28 +44,49 @@ class MyCustomLM(LM):
    #...

-    def greedy_until(self, requests: list[Instance]) -> list[str]:
+    def generate_until(self, requests: list[Instance]) -> list[str]:
        #...
    #...
```

-Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` which returns a tuple of (context, continuation).
+Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with a property `args` whose request-dependent type signature is described below.

-We support
+We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.

-The three types of
+All three request types take as input `requests` of type `list[Instance]` whose `Instance.request_type` matches the method name.
+- `generate_until`
+  - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
+  - Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length is reached or a specific stopping string sequence is produced--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
+  - The generated input+output text from the model will then be returned.
+- `loglikelihood`
+  - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
+  - Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` is either `0` or `1`, taking the value `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input.)
-smth smth tokenizer-agnostic
+- `loglikelihood_rolling`
+  - Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
+  - This is used to evaluate *perplexity* on a data distribution.
+  - It should return `(ll,): Tuple[float]`, i.e. solely the *loglikelihood* of producing each piece of text given no starting input.

-3 reqtypes
-- greedy_until, and the arguments passed to it
-- loglikelihood, and args passed to it

+To allow a model to be evaluated on all types of tasks, you will need to implement these three types of measurements (note that `loglikelihood_rolling` is a special case of `loglikelihood`). For a reference implementation, check out `lm_eval/models/huggingface.py`!

-- loglikelihood_rolling, and args passed to it

+**Tip: be careful of indexing in loglikelihood!**
+LMs take in tokens in position `[0 1 2 ... N]` and output a probability distribution for token position `N+1`. We provide a simplified graphic here, excerpted from `huggingface.py`:
+
+```
+# how this all works (illustrated on a causal decoder-only setup):
+#          CTX      CONT
+# inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+# model  \               \
+# logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+# cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
+```
+
+The final token of the target is not passed into the LM, because we want the LM's predictions *up to but not past* that final target token. For more information, check out https://github.com/EleutherAI/lm-evaluation-harness/issues/942.
+
## Registration

Congrats on implementing your model! Now it's time to test it out.
@@ -83,7 +104,9 @@ class MyCustomLM(LM):

Using this decorator results in the class being added to an accounting of the usable LM types maintained internally to the library at `lm_eval.api.registry.MODEL_REGISTRY`. See `lm_eval.api.registry` for more detail on what sorts of registries and decorators exist in the library!

+## Testing
+
+We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py.

## Other
diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md
index e0ccc6a3ee13f07e5b7fb2e39f2dcc8a16c1c74c..86966be54140738e4a3e474d53fdd17db17dbefd 100644
--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -17,7 +17,7 @@ git checkout -b
pip install -e ".[dev]"
```

-As a concrete example, we'll walk through reimplementing the `gsm8k` benchmark (a *generative* task which requires sampling text from a model) and the `sciq` benchmark. (a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices).
+In this document, we'll walk through the basics of implementing a static benchmark evaluation in two formats: a *generative* task which requires sampling text from a model, such as [`gsm8k`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/gsm8k/gsm8k.yaml), and a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices, such as [`sciq`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/sciq/sciq.yaml).

## Creating a YAML file
@@ -45,6 +45,16 @@ dataset_name: ... # the dataset configuration to use. Leave `null` if your datas
dataset_kwargs: null # any extra keyword arguments that should be passed to the dataset constructor, e.g. `data_dir`.
```
+------------------------------
+**Tip:** To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files:
+```yaml
+dataset_path: json
+dataset_name: null
+dataset_kwargs:
+  data_files: /path/to/my/json
+```
+-------------------------------
+
Next, we'd like to tell our task what the dataset's train, validation, and test splits are named, if they exist:

```yaml
@@ -116,7 +126,7 @@ doc_to_choice: ['No', 'Yes']

We support the [Jinja 2](https://jinja.palletsprojects.com/en/3.1.x/) templating language for writing prompts. In practice, this means you can take your dataset's columns and do many basic string manipulations to place each document into prompted format.

-Take for example `super_glue/boolq`, as input, we'd like to use the features `passage` and `question` and string them together so that for a a sample line `doc`, the model sees something the format of:
+Take for example the dataset `super_glue/boolq`. As input, we'd like to use the features `passage` and `question` and string them together so that for a sample line `doc`, the model sees something in the format of:
```
doc["passage"]
Question: doc["question"]?
@@ -285,7 +295,7 @@ It's now time to check models' performance on your task! In the evaluation harne

To enable this, we provide a checklist that should be completed when contributing a new task, to enable accurate book-keeping and to ensure that tasks added to the library are well-tested and, where applicable, precedented.

-### Task impl. checklist
+### Task Validity Checklist

The checklist is the following:
diff --git a/docs/task_guide.md b/docs/task_guide.md
index 3e15fd9fb7439d8443bf62e35db40c2a3e7e0345..30ae038fddeefc78c955eba221945c9fc2f1f755 100644
--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -20,19 +20,19 @@ Task naming + registration:

Dataset configuration options:
- **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
-- **dataset_name** (`str`, *optional*, defaults to None) — The name of, what HF calls, a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.)
+- **dataset_name** (`str`, *optional*, defaults to None) — The name of what HF calls a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.)
- **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv.
- **training_split** (`str`, *optional*) — Split in the dataset to use as the training split.
- **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split.
- **test_split** (`str`, *optional*) — Split in the dataset to use as the test split.
-- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. Assert that this is not None if num_fewshot > 0.
- **process_docs** (`Callable`, *optional*) — Optionally define a function to apply to each HF dataset split, to preprocess all documents before being fed into prompt template rendering or other evaluation steps. Can be used to rename dataset columns, or to process documents into a format closer to that expected by a prompt template.

Prompting / in-context formatting options:
- **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, it will overwrite doc_to_text, doc_to_target, and doc_to_choice.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choices.
-- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `greedy_until` tasks.
+- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
@@ -42,7 +42,7 @@ Runtime configuration options:

Scoring details:
- **metric_list** (`str`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format.
-- **output_type** (`str`, *optional*, defaults to "greedy_until") — Selects the type of model output for the given task. Options are `greedy_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
+- **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
- **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes.
- **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through model for each sample. Can be used for cases such as self-consistency.
- **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API.
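
The dataset options above are thin wrappers around Hugging Face's `datasets.load_dataset`: `dataset_path` and `dataset_name` are its first two arguments, and `dataset_kwargs` is passed through as extra keyword arguments. A minimal sketch of that correspondence (the JSON path is a placeholder, as in the local-dataset tip earlier):

```python
# Sketch: how the YAML dataset fields translate into datasets.load_dataset calls.
from datasets import load_dataset

# dataset_path: super_glue, dataset_name: boolq
ds = load_dataset("super_glue", "boolq")

# dataset_path: json, dataset_kwargs: {data_files: /path/to/my/json}
# ("/path/to/my/json" is a placeholder path)
local_ds = load_dataset("json", data_files="/path/to/my/json")
```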
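Likewise, the Jinja2 prompt templates described in `new_task_guide.md` can be previewed outside the harness. A small sketch of a `super_glue/boolq`-style template (the `doc` contents here are invented, and the harness's exact rendering environment may differ):

```python
# Sketch: previewing a doc_to_text-style Jinja2 template by hand.
from jinja2 import Template

doc = {"passage": "The harness renders prompts with Jinja2.", "question": "Does it use Jinja2"}
template = Template("{{passage}}\nQuestion: {{question}}?\nAnswer:")
print(template.render(**doc))
# The harness renders prompts with Jinja2.
# Question: Does it use Jinja2?
# Answer:
```
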
diff --git a/ignore.txt b/ignore.txt index cde618d08102b33c38b18edcfed1872659dfab81..de10b539b98c9e500d2d838ed3eb9bece95c00e2 100644 --- a/ignore.txt +++ b/ignore.txt @@ -5,3 +5,4 @@ maka mor te ond +extraversion diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index b8f0942af977f46e0b4a029a270706308c3606b5..aaf98419a591dc95b2a33cd87e68d753b58614fd 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -2,11 +2,10 @@ import os import re import json import fnmatch -import jsonlines import argparse import logging from pathlib import Path - +import numpy as np from lm_eval import evaluator, utils from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger, SPACING @@ -15,6 +14,15 @@ from lm_eval.tasks import include_path from typing import Union +def _handle_non_serializable(o): + if isinstance(o, np.int64) or isinstance(o, np.int32): + return int(o) + elif isinstance(o, set): + return list(o) + else: + return str(o) + + def parse_eval_args() -> argparse.Namespace: parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--model", required=True, help="Name of model e.g. `hf`") @@ -97,6 +105,12 @@ def parse_eval_args() -> argparse.Namespace: default=None, help="Additional path to include if there are external tasks to include.", ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Log error when tasks are not registered.", + ) return parser.parse_args() @@ -105,6 +119,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: # we allow for args to be passed externally, else we parse them ourselves args = parse_eval_args() + eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) os.environ["TOKENIZERS_PARALLELISM"] = "false" if args.limit: @@ -112,7 +127,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
) - if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") include_path(args.include_path) @@ -188,7 +202,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: if results is not None: if args.log_samples: samples = results.pop("samples") - dumped = json.dumps(results, indent=2, default=lambda o: str(o)) + dumped = json.dumps(results, indent=2, default=_handle_non_serializable) if args.show_config: print(dumped) @@ -203,9 +217,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: re.sub("/|=", "__", args.model_args), task_name ) filename = path.joinpath(f"{output_name}.jsonl") - - with jsonlines.open(filename, "w") as f: - f.write_all(samples[task_name]) + samples_dumped = json.dumps( + samples[task_name], indent=2, default=_handle_non_serializable + ) + filename.open("w").write(samples_dumped) print( f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " diff --git a/lm_eval/api/instance.py b/lm_eval/api/instance.py index f3e7f005c96b103088ea9d32e50127292148b67a..7d3c23aa11c1b25c4b2f38006151e4d59601e1e3 100644 --- a/lm_eval/api/instance.py +++ b/lm_eval/api/instance.py @@ -4,7 +4,7 @@ from typing import Literal, Tuple @dataclass class Instance: - request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"] + request_type: Literal["loglikelihood", "loglikelihood_rolling", "generate_until"] doc: dict arguments: tuple idx: int diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 16d9a1431b9033eea9218d1408e6346735309f29..be4d6f0bcbfb399f3207f8ba505058099d9a97e4 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -5,6 +5,7 @@ import numpy as np import sacrebleu import sklearn.metrics import random +import evaluate from lm_eval.api.registry import register_metric, register_aggregation @@ -105,6 +106,25 @@ def ter(items): return sacrebleu.corpus_ter(preds, refs).score +@register_aggregation("brier_score") +def brier_score(items): # This is a passthrough function + gold, predictions = list(zip(*items)) + gold = list(gold) + gold_one_hot = np.eye(np.max(gold) + 1)[gold] + predictions = list(zip(*items))[1] + return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1)) + + +@register_metric( + metric="brier_score", + higher_is_better=False, + output_type=["multiple_choice"], + aggregation="brier_score", +) +def brier_score_fn(items): # This is a passthrough function + return items + + @register_metric( metric="acc", higher_is_better=True, @@ -135,6 +155,19 @@ def acc_mutual_info_fn(items): # This is a passthrough function return items +exact_match = evaluate.load("exact_match") + + +@register_metric( + metric="exact_match", + higher_is_better=True, + output_type="generate_until", + aggregation="mean", +) +def exact_match_fn(**kwargs): + return exact_match.compute(**kwargs) + + @register_metric( metric="perplexity", higher_is_better=False, @@ -212,7 +245,7 @@ def f1_fn(items): # This is a passthrough function @register_metric( metric="bleu", higher_is_better=True, - output_type="greedy_until", + output_type="generate_until", aggregation="bleu", ) def bleu_fn(items): # This is a passthrough function @@ -222,7 +255,7 @@ def bleu_fn(items): # This is a passthrough function @register_metric( metric="chrf", higher_is_better=True, - output_type="greedy_until", + output_type="generate_until", aggregation="chrf", ) def chrf_fn(items): # This is a passthrough function @@ -232,7 +265,7 @@ def chrf_fn(items): # This is a passthrough 
function @register_metric( metric="ter", higher_is_better=True, - output_type="greedy_until", + output_type="generate_until", aggregation="ter", ) def ter_fn(items): # This is a passthrough function diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 42ef1c74549107cbd3fcb32233c2252b2be56352..c24026ac29851bc653fa9ce3ca6b6ed1a54cd375 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -96,7 +96,7 @@ class LM(abc.ABC): # TODO: Add an optional max length @abc.abstractmethod - def greedy_until(self, requests) -> List[str]: + def generate_until(self, requests) -> List[str]: """Generate greedily until a stopping sequence :param requests: list[Instance] @@ -211,12 +211,12 @@ class CachingLM: ) for req in tqdm(requests): hsh = hash_args(attr, req.args) - if attr == "greedy_until" and req.args[1].get("do_sample", False): + if attr == "generate_until" and req.args[1].get("do_sample", False): # when we are doing non-greedy generation, don't use the cache # (else every "randomly sampled" generation would be identical for repeats > 1). if not warned: eval_logger.warning( - f"Arguments to lm.greedy_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests." + f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests." ) warned = True res.append(None) diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 53e5771aba0421525899c89c54c441302ae70160..4e78048b642ec94b530ec85dca79d0d0198e33a3 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -68,10 +68,10 @@ def register_group(name): return decorate -AGGREGATION_REGISTRY = {} -DEFAULT_AGGREGATION_REGISTRY = {} -METRIC_REGISTRY = {} OUTPUT_TYPE_REGISTRY = {} +METRIC_REGISTRY = {} +METRIC_AGGREGATION_REGISTRY = {} +AGGREGATION_REGISTRY = {} HIGHER_IS_BETTER_REGISTRY = {} DEFAULT_METRIC_REGISTRY = { @@ -81,7 +81,7 @@ DEFAULT_METRIC_REGISTRY = { ], "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"], "multiple_choice": ["acc", "acc_norm"], - "greedy_until": ["exact_match"], + "generate_until": ["exact_match"], } @@ -95,8 +95,7 @@ def register_metric(**args): for key, registry in [ ("metric", METRIC_REGISTRY), ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), - # ("output_type", OUTPUT_TYPE_REGISTRY), - ("aggregation", DEFAULT_AGGREGATION_REGISTRY), + ("aggregation", METRIC_AGGREGATION_REGISTRY), ]: if key in args: @@ -158,12 +157,13 @@ def get_aggregation(name): ) -def get_default_aggregation(metric_name): +def get_metric_aggregation(name): + try: - return DEFAULT_AGGREGATION_REGISTRY[metric_name] + return METRIC_AGGREGATION_REGISTRY[name] except KeyError: eval_logger.warning( - f"No default aggregation metric for metric '{metric_name}'!" + "{} metric is not assigned a default aggregation!".format(name), ) @@ -171,7 +171,6 @@ def is_higher_better(metric_name): try: return HIGHER_IS_BETTER_REGISTRY[metric_name] except KeyError: - raise Warning(f"higher_is_better not specified for metric '{metric_name}'!") eval_logger.warning( f"higher_is_better not specified for metric '{metric_name}'!" 
) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index d0f7d14b6cae4024b21574a3563d045c7e5450bc..c35808d7e189993d8e6147ffaadc19d6a1d702f5 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -33,7 +33,7 @@ from lm_eval.api.metrics import ( from lm_eval.api.registry import ( get_metric, get_aggregation, - get_default_aggregation, + get_metric_aggregation, is_higher_better, DEFAULT_METRIC_REGISTRY, OUTPUT_TYPE_REGISTRY, @@ -44,7 +44,7 @@ ALL_OUTPUT_TYPES = [ "loglikelihood", "multiple_choice", "loglikelihood_rolling", - "greedy_until", + "generate_until", ] @@ -52,7 +52,9 @@ ALL_OUTPUT_TYPES = [ class TaskConfig(dict): # task naming/registry task: str = None + task_alias: str = None group: Union[str, list] = None + group_alias: Union[str, list] = None # HF dataset options. # which dataset to use, # and what splits for what purpose @@ -69,7 +71,6 @@ class TaskConfig(dict): doc_to_text: Union[Callable, str] = None doc_to_target: Union[Callable, str] = None doc_to_choice: Union[Callable, str, dict, list] = None - gold_alias: Union[Callable, str] = None process_results: Union[Callable, str] = None use_prompt: str = None description: str = "" @@ -80,7 +81,7 @@ class TaskConfig(dict): num_fewshot: int = 0 # scoring options metric_list: list = None - output_type: str = "greedy_until" + output_type: str = "generate_until" generation_kwargs: dict = None repeats: int = 1 filter_list: Union[str, list] = None @@ -97,11 +98,11 @@ class TaskConfig(dict): self.dataset_path = inspect.getfile(import_module(self.dataset_path)) if self.generation_kwargs is not None: - if self.output_type != "greedy_until": + if self.output_type != "generate_until": eval_logger.warning( - f"[{self.task}] passed `generation_kwargs`, but not using `output_type: greedy_until`!" + f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!" ) - assert self.output_type != "greedy_until" + assert self.output_type != "generate_until" if "temperature" in self.generation_kwargs: self.generation_kwargs["temperature"] = float( @@ -111,7 +112,7 @@ class TaskConfig(dict): if "until" not in self.generation_kwargs: self.generation_kwargs["until"] = [self.fewshot_delimiter] else: - if self.output_type == "greedy_until": + if self.output_type == "generate_until": # ensure that we greedily generate in absence of explicit arguments otherwise self.generation_kwargs = { "until": None @@ -538,12 +539,14 @@ class ConfigurableTask(Task): self._aggregation_list = {} self._higher_is_better = {} - _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] if self.config.metric_list is None: # TODO: handle this in TaskConfig.__post_init__ ? + _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] + for metric_name in _metric_list: self._metric_fn_list[metric_name] = get_metric(metric_name) - self._aggregation_list[metric_name] = get_default_aggregation( + self._metric_fn_kwargs[metric_name] = {} + self._aggregation_list[metric_name] = get_metric_aggregation( metric_name ) self._higher_is_better[metric_name] = is_higher_better(metric_name) @@ -586,7 +589,7 @@ class ConfigurableTask(Task): ] else: INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()} - metric_agg = get_default_aggregation(metric_name) + metric_agg = get_metric_aggregation(metric_name) eval_logger.warning( f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. 
" f"using default " @@ -687,7 +690,10 @@ class ConfigurableTask(Task): for choice in check_choices: choice_has_whitespace = True if choice[0].isspace() else False delimiter_has_whitespace = ( - True if self.config.target_delimiter[-1].isspace() else False + True + if self.config.target_delimiter.rstrip() + != self.config.target_delimiter + else False ) if delimiter_has_whitespace and choice_has_whitespace: @@ -696,7 +702,7 @@ class ConfigurableTask(Task): ) elif (not delimiter_has_whitespace) and (not choice_has_whitespace): eval_logger.warning( - f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace' + f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace' ) def download(self, dataset_kwargs=None) -> None: @@ -888,26 +894,6 @@ class ConfigurableTask(Task): else: raise TypeError - def gold_alias(self, doc): - # returns a version of the gold target answer to a document, - # which should be passed into metric for scoring as the ground truth. - - # in multiple_choice tasks, this should be castable to an int corresponding to the index - # within the answer choices, while doc_to_target is the string version of {{answer_choices[gold]}}. - if self.config.gold_alias is not None: - doc_to_target = self.config.gold_alias - else: - return self.doc_to_target(doc) - - if type(doc_to_target) == str: - return utils.apply_template(doc_to_target, doc) - elif callable(doc_to_target): - return doc_to_target(doc) - elif hasattr(doc_to_target, "apply"): - return doc_to_target.apply(doc)[1] - else: - raise TypeError - def construct_requests( self, doc: dict, ctx: str, **kwargs ) -> Union[List[Instance], Instance]: @@ -958,7 +944,7 @@ class ConfigurableTask(Task): ) return request_list - elif self.OUTPUT_TYPE == "greedy_until": + elif self.OUTPUT_TYPE == "generate_until": arguments = (ctx, self.config.generation_kwargs) return Instance( @@ -1055,12 +1041,21 @@ class ConfigurableTask(Task): # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly exact_match = int(is_greedy[gold]) if gold != -100 else 0 + prob_norm = utils.softmax(lls) + + # TODO use keyword arguments to the metric? + # gold, pred, norm stuff, the original lls, result_dict = { **({"acc": acc} if "acc" in use_metric else {}), **({"f1": (gold, pred)} if "f1" in use_metric else {}), **({"mcc": (gold, pred)} if "mcc" in use_metric else {}), **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}), **({"exact_match": exact_match} if "exact_match" in use_metric else {}), + **( + {"brier_score": (gold, prob_norm)} + if "brier_score" in use_metric + else {} + ), } if "acc_mutual_info" in use_metric: @@ -1070,7 +1065,7 @@ class ConfigurableTask(Task): acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0 result_dict["acc_mutual_info"] = acc_mutual_info - elif self.OUTPUT_TYPE == "greedy_until": + elif self.OUTPUT_TYPE == "generate_until": gold = self.doc_to_target(doc) result = results[0] if self.config.doc_to_choice is not None: @@ -1134,7 +1129,7 @@ class ConfigurableTask(Task): else: raise ValueError( f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! 
Please use one of ", - "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'", + "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'", ) return result_dict diff --git a/lm_eval/benchmarks/__init__.py b/lm_eval/benchmarks/__init__.py deleted file mode 100644 index e87ad788fba09f16a10e3eb142549a1e5f2e499a..0000000000000000000000000000000000000000 --- a/lm_eval/benchmarks/__init__.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -import yaml - -from lm_eval import utils -from lm_eval.tasks import register_configurable_task, check_prompt_config -from lm_eval.logger import eval_logger -from lm_eval.api.registry import ( - TASK_REGISTRY, - GROUP_REGISTRY, - ALL_TASKS, -) - - -def include_benchmarks(task_dir: str) -> None: - for root, subdirs, file_list in os.walk(task_dir): - if (subdirs == [] or "__pycache__" in subdirs) and (len(file_list) > 0): - for f in file_list: - if f.endswith(".yaml"): - try: - benchmark_path = os.path.join(root, f) - - with open(benchmark_path, "rb") as file: - yaml_config = yaml.full_load(file) - - if "prompts" in yaml_config: - continue # Skip it - - assert "group" in yaml_config - group = yaml_config["group"] - all_task_list = yaml_config["task"] - config_list = [ - task for task in all_task_list if type(task) != str - ] - task_list = [ - task for task in all_task_list if type(task) == str - ] - - for task_config in config_list: - yaml_dir = os.path.dirname(benchmark_path) - task_config = utils.load_yaml_config( - yaml_config=task_config, yaml_dir=yaml_dir - ) - if "use_prompt" in task_config: - if "yaml" in task_config["use_prompt"]: - task_config["use_prompt"] = os.path.join( - root, task_config["use_prompt"] - ) - - var_configs = check_prompt_config( - { - **task_config, - **{"group": group}, - } - ) - for config in var_configs: - register_configurable_task(config) - - task_names = utils.pattern_match(task_list, ALL_TASKS) - for task in task_names: - if task in TASK_REGISTRY: - if group in GROUP_REGISTRY: - GROUP_REGISTRY[group].append(task) - else: - GROUP_REGISTRY[group] = [task] - ALL_TASKS.add(group) - except Exception as error: - eval_logger.warning( - "Failed to load benchmark in\n" - f" {benchmark_path}\n" - " Benchmark will not be added to registry\n" - f" Error: {error}" - ) - - -task_dir = os.path.dirname(os.path.abspath(__file__)) + "/" -include_benchmarks(task_dir) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index bf35097c5bdc8bc7d3392c1c50b589cd6e97cb5b..3fa9633ecb73b8635e6e2746c95f1a0c65a9b532 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -2,7 +2,6 @@ import random import itertools import json import collections -import logging import sys import torch @@ -25,10 +24,6 @@ from lm_eval.utils import ( from lm_eval.logger import eval_logger -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stdout)) - @positional_deprecated def simple_evaluate( @@ -221,14 +216,15 @@ def evaluate( task_hierarchy = collections.defaultdict(list) # store the ordering of tasks and groups task_order = collections.defaultdict(int) - # store the aggregation for aggregating across tasks in the same group - sample_agg_fn = collections.defaultdict(dict) + task_group_alias = collections.defaultdict(dict) # get lists of each type of request for task_name, task in task_dict.items(): if type(task) == tuple: group_name, task = task task_hierarchy[group_name].append(task_name) + versions[group_name] = "N/A" + else: task_hierarchy[task_name] = [] 
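
For reference, the `brier_score` aggregation introduced in `lm_eval/api/metrics.py` above boils down to the mean squared error between one-hot gold labels and the softmax-normalized choice probabilities. A self-contained sketch with invented values:

```python
# Standalone sketch of the Brier score computation used by the new
# brier_score aggregation (lower is better).
import numpy as np

def brier_score(gold, probs):
    gold_one_hot = np.eye(probs.shape[1])[gold]  # one-hot encode gold label indices
    return np.mean(np.sum((probs - gold_one_hot) ** 2, axis=1))

probs = np.array([
    [0.7, 0.2, 0.1],  # confident and correct: per-sample score 0.14
    [0.3, 0.4, 0.3],  # diffuse but correct: per-sample score 0.54
])
print(brier_score(np.array([0, 1]), probs))  # (0.14 + 0.54) / 2 = 0.34
```
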
@@ -238,6 +234,14 @@ def evaluate( versions[task_name] = task.VERSION configs[task_name] = dict(task.dump_config()) + if "task_alias" in configs[task_name]: + task_group_alias[task_name] = configs[task_name]["task_alias"] + + if ("group_alias" in configs[task_name]) and ( + group_name not in task_group_alias + ): + task_group_alias[group_name] = configs[task_name]["group_alias"] + if limit is not None: if task.has_test_docs(): task_docs = task.test_docs() @@ -449,23 +453,8 @@ def evaluate( group_name = None agg_fn = task.aggregation()[metric] - task_score = agg_fn(items) - - if group_name is not None: - sample_metric_key = metric + "(sample agg)," + key - for grouping in task_to_group[task_name]: - if metric_key in results[grouping]: - results[grouping][metric_key].append(task_score) - else: - results[grouping][metric_key] = [task_score] - - if sample_metric_key in results[grouping]: - results[grouping][sample_metric_key] += items - else: - results[grouping][sample_metric_key] = items.copy() - sample_agg_fn[grouping][sample_metric_key] = agg_fn - - results[task_name][metric_key] = task_score + results[task_name][metric_key] = agg_fn(items) + results[task_name]["samples"] = len(items) # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap # so we run them less iterations. still looking for a cleaner way to do this @@ -481,33 +470,139 @@ def evaluate( results[task_name][metric + "_stderr" + "," + key] = stderr(items) if bool(results): - for task_or_group in results.keys(): - for metric in results[task_or_group].keys(): - if type(results[task_or_group][metric]) == list: - if "(sample agg)" in metric: - results[task_or_group][metric] = sample_agg_fn[ - task_or_group - ][metric](results[task_or_group][metric]) - else: - results[task_or_group][metric] = np.average( - results[task_or_group][metric] - ) - versions[task_or_group] = "N/A" - for task_name, task in task_dict.items(): - if type(task) == tuple: - group_name, task = task + for group, task_list in reversed(task_hierarchy.items()): + + if task_list == []: + total_size = results[group]["samples"] + else: + total_size = 0 + + for task in task_list: + metrics = results[task] + + current_size = metrics.pop("samples") + # TODO: There should be a way for users + # to toggle between weighted and + # unweighted averaging + # For unweighted averaging, use: + # current_size = 1 + + all_stderr = [] + for metric in [ + key for key in metrics.keys() if "_stderr" not in key + ]: + + stderr = "_stderr,".join(metric.split(",")) + stderr_score = results[task][stderr] + var_score = stderr_score**2 + metric_score = results[task][metric] + + all_stderr.append(stderr) + + if metric in results[group]: + results[group][metric] = ( + results[group][metric] * total_size + + metric_score * current_size + ) / (total_size + current_size) + # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ + results[group][stderr] = ( + (total_size - 1) * results[group][stderr] + + (current_size - 1) * var_score + ) / ( + total_size + current_size - 1 + ) + total_size * current_size / ( + (total_size + current_size) + * (total_size + current_size - 1) + ) * ( + results[group][metric] - metric_score + ) ** 2 + else: + results[group][metric] = metric_score + results[group][stderr] = var_score + + total_size += current_size + + for stderr in all_stderr: + results[group][stderr] = np.sqrt(results[group][stderr]) + + results[group]["samples"] = total_size + + def print_tasks(task_hierarchy, task_order, task_version, 
task_group_alias): + + results_agg = collections.defaultdict(dict) + groups_agg = collections.defaultdict(dict) + for group_name, task_list in task_hierarchy.items(): + order = task_order[group_name] - tabbed_name = "-" * order + group_name - results_agg[tabbed_name] = results[group_name] - versions[tabbed_name] = versions[group_name] - if order == 0: - groups_agg[group_name] = results[group_name] - - order = task_order[task_name] - tabbed_name = "-" * order + task_name - results_agg[tabbed_name] = results[task_name] - versions[tabbed_name] = versions[task_name] + results_agg[group_name] = results[group_name].copy() + results_agg[group_name]["tab"] = order + + if (order < max(task_order.values())) and (len(task_list) > 0): + groups_agg[group_name] = results[group_name].copy() + groups_agg[group_name]["tab"] = order + + if task_list != []: + for task in sorted(task_list): + if task in task_hierarchy: + _task_hierarchy = {task: task_hierarchy[task]} + else: + _task_hierarchy = {task: []} + + _results_agg, _groups_agg, task_version = print_tasks( + _task_hierarchy, task_order, task_version, task_group_alias + ) + + results_agg = {**results_agg, **_results_agg} + groups_agg = {**groups_agg, **_groups_agg} + + return results_agg, groups_agg, task_version + + results_agg, groups_agg, versions = print_tasks( + task_hierarchy, task_order, versions, task_group_alias + ) + + _results_agg = collections.defaultdict(dict) + _versions = collections.defaultdict(dict) + for task in results_agg: + task_results = results_agg[task] + + if "samples" in task_results: + task_results.pop("samples") + + tab_string = "" + if "tab" in task_results: + tab = task_results.pop("tab") + tab_string = " " * tab + "- " if tab > 0 else "" + + if task in task_group_alias: + task_alias = task_group_alias[task] + _results_agg[tab_string + task_alias] = task_results + _versions[tab_string + task_alias] = versions[task] + else: + _results_agg[tab_string + task] = task_results + _versions[tab_string + task] = versions[task] + results_agg = _results_agg + versions = _versions + + _groups_agg = collections.defaultdict(dict) + for group in groups_agg: + group_results = groups_agg[group] + + if "samples" in group_results: + group_results.pop("samples") + + tab_string = "" + if "tab" in group_results: + tab = group_results.pop("tab") + tab_string = " " * tab + "- " if tab > 0 else "" + + if group in task_group_alias: + group_alias = task_group_alias[group] + _groups_agg[tab_string + group_alias] = group_results + else: + _groups_agg[tab_string + group] = group_results + groups_agg = _groups_agg results_dict = { "results": dict(results_agg.items()), diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py index 953ea91377761b67d9f555ad9c88b16f090b4064..be144b1696d1d238d2e1d818516a7602367a75cb 100644 --- a/lm_eval/models/anthropic_llms.py +++ b/lm_eval/models/anthropic_llms.py @@ -138,7 +138,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): raise NotImplementedError("No support for logits.") - def greedy_until(self, requests) -> List[str]: + def generate_until(self, requests) -> List[str]: if not requests: return [] @@ -164,7 +164,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ) res.append(response) - self.cache_hook.add_partial("greedy_until", request, response) + self.cache_hook.add_partial("generate_until", request, response) except anthropic.APIConnectionError as e: 
# type: ignore # noqa: F821 eval_logger.critical(f"Server unreachable: {e.__cause__}") break @@ -179,7 +179,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e raise NotImplementedError() def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override greedy_until + # Isn't used because we override generate_until raise NotImplementedError() def loglikelihood(self, requests): diff --git a/lm_eval/models/dummy.py b/lm_eval/models/dummy.py index 0264e763c88d0faaca0d21235af9acb9f0b7edbc..b13a3900f9db6929267940fc03dad91859759759 100644 --- a/lm_eval/models/dummy.py +++ b/lm_eval/models/dummy.py @@ -20,7 +20,7 @@ class DummyLM(LM): return res - def greedy_until(self, requests): + def generate_until(self, requests): res = [] for ctx, _ in requests: diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 57dae97498e60708e85450c8f364da1835c0afab..8feb8cfad334122f1bd587d1723dd5884659b0ee 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -621,6 +621,23 @@ class HFLM(LM): return loglikelihoods + def _batch_scheduler(self, pos, n_reordered_requests): + sched = pos // int(len(n_reordered_requests) / self.batch_schedule) + if sched in self.batch_sizes: + return self.batch_sizes[sched] + if (len(self.batch_sizes) > 1) and ( + self.batch_sizes[sched - 1] == self.max_batch_size + ): + # if previous batch size is already maximal, skip recomputation + self.batch_sizes[sched] = self.max_batch_size + return self.batch_sizes[sched] + print( + f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size" + ) + self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos) + print(f"Determined largest batch size: {self.batch_sizes[sched]}") + return self.batch_sizes[sched] + def _loglikelihood_tokens( self, requests, disable_tqdm: bool = False, override_bs=None ): @@ -644,38 +661,22 @@ class HFLM(LM): # automatic (variable) batch size detection for vectorization # pull longest context sample from request - def _batch_scheduler(pos): - sched = pos // int(n_reordered_requests / self.batch_schedule) - if sched in self.batch_sizes: - return self.batch_sizes[sched] - if (len(self.batch_sizes) > 1) and ( - self.batch_sizes[sched - 1] == self.max_batch_size - ): - # if previous batch size is already maximal, skip recomputation - self.batch_sizes[sched] = self.max_batch_size - return self.batch_sizes[sched] - print( - f"Passed argument batch_size = auto:{self.batch_schedule}. 
Detecting largest batch size" - ) - self.batch_sizes[sched] = self._detect_batch_size( - re_ord.get_reordered(), pos - ) - print(f"Determined largest batch size: {self.batch_sizes[sched]}") - return self.batch_sizes[sched] - - for chunk in utils.chunks( - tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))), + chunks = utils.chunks( + re_ord.get_reordered(), n=self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0, - fn=_batch_scheduler + fn=self._batch_scheduler if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs else None, - ): + ) + + pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0))) + for chunk in chunks: inps = [] cont_toks_list = [] inplens = [] @@ -812,10 +813,13 @@ class HFLM(LM): res.append(answer) self.cache_hook.add_partial("loglikelihood", cache_key, answer) + pbar.update(1) + + pbar.close() return re_ord.get_original(res) - def greedy_until(self, requests): + def generate_until(self, requests): res = defaultdict(list) re_ords = {} @@ -838,13 +842,26 @@ class HFLM(LM): re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) pbar = tqdm(total=len(requests), disable=(self.rank != 0)) - + if self.batch_size == "auto": + # using rolling window with maximum context + print("Passed argument batch_size = auto. Detecting largest batch size") + batch_size = self._detect_batch_size() + print(f"Determined Largest batch size: {batch_size}") + adaptive_batch_size = batch_size # for each different set of kwargs, we execute all requests, by batch. for key, re_ord in re_ords.items(): - for chunk in utils.chunks( + chunks = utils.chunks( re_ord.get_reordered(), - self.batch_size, - ): + n=self.batch_size + if self.batch_size != "auto" + else adaptive_batch_size + if adaptive_batch_size is not None + else 0, + fn=self._batch_scheduler + if self.batch_size == "auto" and not adaptive_batch_size + else None, + ) + for chunk in chunks: contexts, all_gen_kwargs = zip(*chunk) # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. 
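
With the renames above, the model-facing surface is the three request types. A toy `LM` subclass against that interface might look like the following (a sketch, not the library's reference implementation — see `lm_eval/models/dummy.py` for the real trivial example, and `lm_eval/models/huggingface.py` for a full one):

```python
# Toy LM subclass wired to the renamed request types: it answers every
# generation request with an empty string and scores every target with
# log-probability 0.0 (probability 1), marked as greedy.
from lm_eval.api.model import LM


class MyEchoLM(LM):
    def generate_until(self, requests):
        # one generated string per request; each request's args are
        # (context_str, generation_kwargs_dict)
        return ["" for _ in requests]

    def loglikelihood(self, requests):
        # one (log-prob, is_greedy) pair per request; args are
        # (context_str, target_str)
        return [(0.0, 1) for _ in requests]

    def loglikelihood_rolling(self, requests):
        # one log-prob per request; args are (text_str,)
        return [0.0 for _ in requests]
```
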
@@ -920,7 +937,7 @@ class HFLM(LM): res[key].append(s) self.cache_hook.add_partial( - "greedy_until", (context, gen_kwargs), s + "generate_until", (context, gen_kwargs), s ) pbar.update(1) # reorder this group of results back to original unsorted form diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index eb05dd4c32b6f507daf75914c6a886d3ec0ad474..1a06d85ad4fdc034fd4f6b5585c09f5d1c64882a 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -203,7 +203,7 @@ class OpenaiCompletionsLM(LM): self.cache_hook.add_partial("loglikelihood", cache_key, answer) return re_ord.get_original(res) - def greedy_until(self, requests) -> List[str]: + def generate_until(self, requests) -> List[str]: if not requests: return [] res = [] @@ -260,7 +260,7 @@ class OpenaiCompletionsLM(LM): # partial caching self.cache_hook.add_partial( - "greedy_until", (context, {"until": until_}), s + "generate_until", (context, {"until": until_}), s ) res.append(s) @@ -271,7 +271,7 @@ class OpenaiCompletionsLM(LM): raise NotImplementedError() def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override greedy_until + # Isn't used because we override generate_until raise NotImplementedError() def loglikelihood_rolling(self, requests) -> List[float]: diff --git a/lm_eval/models/textsynth.py b/lm_eval/models/textsynth.py index a8fcfb9c9fe48d66c95fe68058c1de57d73e7e27..379f11b9023d7609cf964f6284e4a52d7ab05457 100644 --- a/lm_eval/models/textsynth.py +++ b/lm_eval/models/textsynth.py @@ -58,7 +58,7 @@ class TextSynthLM(LM): @property def eot_token_id(self): - # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until raise NotImplementedError() @property @@ -72,20 +72,20 @@ class TextSynthLM(LM): @property def batch_size(self): - # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until raise NotImplementedError() @property def device(self): - # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until raise NotImplementedError() def tok_encode(self, string: str): - # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until raise NotImplementedError() def tok_decode(self, tokens): - # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until raise NotImplementedError() def loglikelihood(self, requests): @@ -122,7 +122,7 @@ class TextSynthLM(LM): "input tokenization support from TextSynth." ) - def greedy_until(self, requests): + def generate_until(self, requests): if not requests: return [] @@ -146,7 +146,7 @@ class TextSynthLM(LM): s = resp["text"] res.append(s) - self.cache_hook.add_partial("greedy_until", (inp, request_args), s) + self.cache_hook.add_partial("generate_until", (inp, request_args), s) else: logger.error( f"The following response does not contain generated `text`. 
" @@ -160,5 +160,5 @@ class TextSynthLM(LM): raise NotImplementedError() def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override greedy_until + # Isn't used because we override generate_until raise NotImplementedError() diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index b95012b5f3b8f82f7500c493f2e668b854ac8960..17521221a268d73b00b08bc2a2bcbcb4f1088138 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -59,6 +59,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for - [x] MGSM - [ ] SCROLLS - [x] Babi +- [x] Belebele # Novel Tasks Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*. diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 0b124a678b86e5c5fc341737398aaf5537f7599f..d36b6077d0a551c691c7a85d1654fd7243818e04 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -4,7 +4,6 @@ from typing import List, Union, Dict from lm_eval import utils from lm_eval import prompts -from lm_eval.logger import eval_logger from lm_eval.api.task import TaskConfig, Task, ConfigurableTask from lm_eval.api.registry import ( register_task, @@ -14,6 +13,10 @@ from lm_eval.api.registry import ( ALL_TASKS, ) +import logging + +eval_logger = logging.getLogger("lm-eval") + def register_configurable_task(config: Dict[str, str]) -> int: SubClass = type( @@ -98,7 +101,7 @@ def check_prompt_config( ] ) }, - **{"output_type": "greedy_until"}, + **{"output_type": "generate_until"}, } ) else: @@ -145,7 +148,7 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: except Exception as error: import traceback - eval_logger.warning( + eval_logger.debug( "Failed to load config in\n" f" {yaml_path}\n" " Config will not be added to registry\n" diff --git a/lm_eval/tasks/babi/babi.yaml b/lm_eval/tasks/babi/babi.yaml index 1b10cc004ccae97098c1e645917c609297cc48a9..5181b2a1a714230141b996ed6e00fd03e4f78364 100644 --- a/lm_eval/tasks/babi/babi.yaml +++ b/lm_eval/tasks/babi/babi.yaml @@ -1,7 +1,7 @@ task: babi dataset_path: Muennighoff/babi dataset_name: null -output_type: greedy_until +output_type: generate_until training_split: train validation_split: valid test_split: test diff --git a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml index 2e2e8bc947fc8f8b3edd1d250e72b270b1e179ae..b96dd712fb43915ef33d1ecd3dae29590f77abc2 100644 --- a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml @@ -1,6 +1,6 @@ group: bbh_flan_cot_fewshot dataset_path: lukaemon/bbh -output_type: greedy_until +output_type: generate_until test_split: test doc_to_target: "{{target}}" metric_list: diff --git a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml index 7ccf3699e18197c0c60d3cede8dba16c7b9f1082..8bb0f2b714c6158e9717bc21f1db7368d3938b43 100644 --- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml @@ -1,6 +1,6 @@ group: bbh_flan_cot_zeroshot dataset_path: lukaemon/bbh -output_type: greedy_until +output_type: generate_until test_split: test doc_to_target: "{{target}}" 
metric_list: diff --git a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml index 89e5de29fbd6bfa6442a4a1ad504c46f72ece0b8..93503989875d7a6b7b25f05e840013f94f9ec359 100644 --- a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml @@ -1,6 +1,6 @@ group: bbh_flan_fewshot dataset_path: lukaemon/bbh -output_type: greedy_until +output_type: generate_until test_split: test doc_to_target: "{{target}}" metric_list: diff --git a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml index 66dbf3690bda2a763d7511ea3ac85ed905efff54..cd5bafe84f0c7a873a1b8a0d53d5a3e0808fa578 100644 --- a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml @@ -1,6 +1,6 @@ group: bbh_flan_zeroshot dataset_path: lukaemon/bbh -output_type: greedy_until +output_type: generate_until test_split: test doc_to_target: "{{target}}" metric_list: diff --git a/lm_eval/tasks/belebele/README.md b/lm_eval/tasks/belebele/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e08e63e8a11b563dad771f8ee99a1da167e55016 --- /dev/null +++ b/lm_eval/tasks/belebele/README.md @@ -0,0 +1,49 @@ +# Belebele + +### Paper + +The Belebele Benchmark for Massively Multilingual NLU Evaluation +https://arxiv.org/abs/2308.16884 + +Belebele is a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. This dataset enables the evaluation of mono- and multi-lingual models in high-, medium-, and low-resource languages. Each question has four multiple-choice answers and is linked to a short passage from the FLORES-200 dataset. The human annotation procedure was carefully curated to create questions that discriminate between different levels of generalizable language comprehension and is reinforced by extensive quality checks. While all questions directly relate to the passage, the English dataset on its own proves difficult enough to challenge state-of-the-art language models. Being fully parallel, this dataset enables direct comparison of model performance across all languages. Belebele opens up new avenues for evaluating and analyzing the multilingual abilities of language models and NLP systems. + +Homepage: https://github.com/facebookresearch/belebele + +### Citation + +```bibtex +@misc{bandarkar2023belebele, + title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants}, + author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa}, + year={2023}, + eprint={2308.16884}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +- `belebele`: All 122 languages of the Belebele dataset, evaluated following the methodology in MMLU's original implementation. + +#### Tasks + + +The following tasks evaluate languages in the Belebele dataset using loglikelihood-based multiple-choice scoring: +- `belebele_{language}` + +The variant evaluated here is the 0-shot or few-shot evaluation with English Instructions. + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? 
+ * [x] If yes, does the original paper provide a reference implementation? + * [ ] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/belebele/_default_template_yaml b/lm_eval/tasks/belebele/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..06fe81155ecb399d4e46e2f70a32b256a5bb094b --- /dev/null +++ b/lm_eval/tasks/belebele/_default_template_yaml @@ -0,0 +1,19 @@ +group: belebele +dataset_path: facebook/belebele +test_split: test +fewshot_split: test +fewshot_config: + sampler: first_n +output_type: multiple_choice +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nAnswer:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/lm_eval/tasks/belebele/_generate_configs.py b/lm_eval/tasks/belebele/_generate_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..9df56f5feb39302f59f2959e2dd95ed3d27fbaf1 --- /dev/null +++ b/lm_eval/tasks/belebele/_generate_configs.py @@ -0,0 +1,64 @@ +""" +Take in a YAML, and output all other splits with this YAML +""" +import os +import yaml +import argparse +import requests + +from tqdm import tqdm + +from lm_eval.logger import eval_logger + +API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele" + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--base_yaml_path", required=True) + parser.add_argument("--save_prefix_path", default="belebele") + parser.add_argument("--cot_prompt_path", default=None) + parser.add_argument("--task_prefix", default="") + return parser.parse_args() + + +if __name__ == "__main__": + + args = parse_args() + + # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
+ base_yaml_name = os.path.split(args.base_yaml_path)[-1] + with open(args.base_yaml_path) as f: + base_yaml = yaml.full_load(f) + + if args.cot_prompt_path is not None: + import json + + with open(args.cot_prompt_path) as f: + cot_file = json.load(f) + + def query(): + response = requests.get(API_URL) + return response.json()["splits"] + + languages = [split["config"] for split in query()] + + for lang in tqdm(languages): + yaml_dict = { + "include": base_yaml_name, + "task": f"belebele_{args.task_prefix}_{lang}" + if args.task_prefix != "" + else f"belebele_{lang}", + "dataset_name": lang, + } + + file_save_path = args.save_prefix_path + f"_{lang}.yaml" + eval_logger.info(f"Saving yaml for subset {lang} to {file_save_path}") + with open(file_save_path, "w") as yaml_file: + yaml.dump( + yaml_dict, + yaml_file, + width=float("inf"), + allow_unicode=True, + default_style='"', + ) diff --git a/lm_eval/tasks/belebele/belebele_acm_Arab.yaml b/lm_eval/tasks/belebele/belebele_acm_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7afe81f447b9eef09a18bca5c8a95e98fc382a1d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_acm_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "acm_Arab" +"include": "_default_template_yaml" +"task": "belebele_acm_Arab" diff --git a/lm_eval/tasks/belebele/belebele_afr_Latn.yaml b/lm_eval/tasks/belebele/belebele_afr_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ced8ffcfcffca316466334fbda39042a2717121 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_afr_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "afr_Latn" +"include": "_default_template_yaml" +"task": "belebele_afr_Latn" diff --git a/lm_eval/tasks/belebele/belebele_als_Latn.yaml b/lm_eval/tasks/belebele/belebele_als_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..507fe758a349a250fbeac3d3bde18051a031e6bf --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_als_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "als_Latn" +"include": "_default_template_yaml" +"task": "belebele_als_Latn" diff --git a/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml b/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0b4bd0acc89ffacc2723c8b867487233f116e59 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml @@ -0,0 +1,3 @@ +"dataset_name": "amh_Ethi" +"include": "_default_template_yaml" +"task": "belebele_amh_Ethi" diff --git a/lm_eval/tasks/belebele/belebele_apc_Arab.yaml b/lm_eval/tasks/belebele/belebele_apc_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5d6777f530167259acfba4bd217e8e65c9b13fb --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_apc_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "apc_Arab" +"include": "_default_template_yaml" +"task": "belebele_apc_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arb_Arab.yaml b/lm_eval/tasks/belebele/belebele_arb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a61d78ed0bf1eb1bf8bd5ef6c12b6d9101ac1159 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_arb_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "arb_Arab" +"include": "_default_template_yaml" +"task": "belebele_arb_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arb_Latn.yaml b/lm_eval/tasks/belebele/belebele_arb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..976846541269268c66f5903a1483ae0cc2c4eaac --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_arb_Latn.yaml @@ -0,0 +1,3 @@ 
+"dataset_name": "arb_Latn" +"include": "_default_template_yaml" +"task": "belebele_arb_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ars_Arab.yaml b/lm_eval/tasks/belebele/belebele_ars_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac322a6bdf42f0e89d67323469606eb5d8c8ba7a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ars_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ars_Arab" +"include": "_default_template_yaml" +"task": "belebele_ars_Arab" diff --git a/lm_eval/tasks/belebele/belebele_ary_Arab.yaml b/lm_eval/tasks/belebele/belebele_ary_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78bb8e29b89282e7cea94ec618440d897298b833 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ary_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ary_Arab" +"include": "_default_template_yaml" +"task": "belebele_ary_Arab" diff --git a/lm_eval/tasks/belebele/belebele_arz_Arab.yaml b/lm_eval/tasks/belebele/belebele_arz_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28d8565fd2aa9534f94702b31f6f828553307aef --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_arz_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "arz_Arab" +"include": "_default_template_yaml" +"task": "belebele_arz_Arab" diff --git a/lm_eval/tasks/belebele/belebele_asm_Beng.yaml b/lm_eval/tasks/belebele/belebele_asm_Beng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fcc708dfa52eec48ffcf0cec864b5fadba162213 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_asm_Beng.yaml @@ -0,0 +1,3 @@ +"dataset_name": "asm_Beng" +"include": "_default_template_yaml" +"task": "belebele_asm_Beng" diff --git a/lm_eval/tasks/belebele/belebele_azj_Latn.yaml b/lm_eval/tasks/belebele/belebele_azj_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5add1b77dbd88e259953a1b7233c9be60eb05fe --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_azj_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "azj_Latn" +"include": "_default_template_yaml" +"task": "belebele_azj_Latn" diff --git a/lm_eval/tasks/belebele/belebele_bam_Latn.yaml b/lm_eval/tasks/belebele/belebele_bam_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c2585d738925502b3295f10c8b8d2f710e620f2 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_bam_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "bam_Latn" +"include": "_default_template_yaml" +"task": "belebele_bam_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ben_Beng.yaml b/lm_eval/tasks/belebele/belebele_ben_Beng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62e9ea10cb7d0773a52581613c0304eaf5a8a525 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ben_Beng.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ben_Beng" +"include": "_default_template_yaml" +"task": "belebele_ben_Beng" diff --git a/lm_eval/tasks/belebele/belebele_ben_Latn.yaml b/lm_eval/tasks/belebele/belebele_ben_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ed8d7fab60b368619abbb863c56bdb572986b7f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ben_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ben_Latn" +"include": "_default_template_yaml" +"task": "belebele_ben_Latn" diff --git a/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml b/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8cf0464e02e46c7f2897a2d0a5ec355b35af3081 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml @@ -0,0 +1,3 @@ +"dataset_name": 
"bod_Tibt" +"include": "_default_template_yaml" +"task": "belebele_bod_Tibt" diff --git a/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a34c29e36f7bb1b96c817cbe0052e0175cdcf89c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "bul_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_bul_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_cat_Latn.yaml b/lm_eval/tasks/belebele/belebele_cat_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa6af71768bac6cb055bac7434d12041490ebfaf --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_cat_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "cat_Latn" +"include": "_default_template_yaml" +"task": "belebele_cat_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml b/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2d0a2dd59560e66819f4d7329ecda8aa8576740 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ceb_Latn" +"include": "_default_template_yaml" +"task": "belebele_ceb_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ces_Latn.yaml b/lm_eval/tasks/belebele/belebele_ces_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee7b6894041ea8c1eb66150799066b39df7ce71f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ces_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ces_Latn" +"include": "_default_template_yaml" +"task": "belebele_ces_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml b/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02277ba517b300cd88404d102518d4476198f5a4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ckb_Arab" +"include": "_default_template_yaml" +"task": "belebele_ckb_Arab" diff --git a/lm_eval/tasks/belebele/belebele_dan_Latn.yaml b/lm_eval/tasks/belebele/belebele_dan_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45555e9db0af78c2d332d729c357cb432dfa32e9 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_dan_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "dan_Latn" +"include": "_default_template_yaml" +"task": "belebele_dan_Latn" diff --git a/lm_eval/tasks/belebele/belebele_deu_Latn.yaml b/lm_eval/tasks/belebele/belebele_deu_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..543d32a815f88151fe43a4619ced8ab36b214220 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_deu_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "deu_Latn" +"include": "_default_template_yaml" +"task": "belebele_deu_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ell_Grek.yaml b/lm_eval/tasks/belebele/belebele_ell_Grek.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b5bc5ad89153e56fe1a39fd91bc2094ea1854d4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ell_Grek.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ell_Grek" +"include": "_default_template_yaml" +"task": "belebele_ell_Grek" diff --git a/lm_eval/tasks/belebele/belebele_eng_Latn.yaml b/lm_eval/tasks/belebele/belebele_eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f02cdeb949c441c95abe5c290aa7ee975e1ee88e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_eng_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "eng_Latn" 
+"include": "_default_template_yaml" +"task": "belebele_eng_Latn" diff --git a/lm_eval/tasks/belebele/belebele_est_Latn.yaml b/lm_eval/tasks/belebele/belebele_est_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d74778c7ca9dd93d5eb75f7a75f7bc5fb0b4be1 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_est_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "est_Latn" +"include": "_default_template_yaml" +"task": "belebele_est_Latn" diff --git a/lm_eval/tasks/belebele/belebele_eus_Latn.yaml b/lm_eval/tasks/belebele/belebele_eus_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18711684d0126480903a5d7f4a30257e1f21fd65 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_eus_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "eus_Latn" +"include": "_default_template_yaml" +"task": "belebele_eus_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fin_Latn.yaml b/lm_eval/tasks/belebele/belebele_fin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..898a6e7a86965739dcee8809f8ce473472aa94ac --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_fin_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "fin_Latn" +"include": "_default_template_yaml" +"task": "belebele_fin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fra_Latn.yaml b/lm_eval/tasks/belebele/belebele_fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..154b62d8538f71d4b3b5c2cdfc78ee8aad8b567d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_fra_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "fra_Latn" +"include": "_default_template_yaml" +"task": "belebele_fra_Latn" diff --git a/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml b/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8015f09047f793679ebcd49f739b1d580e1a6cc0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "fuv_Latn" +"include": "_default_template_yaml" +"task": "belebele_fuv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml b/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c671796fde14da48440d9560b12b9f95805125a8 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "gaz_Latn" +"include": "_default_template_yaml" +"task": "belebele_gaz_Latn" diff --git a/lm_eval/tasks/belebele/belebele_grn_Latn.yaml b/lm_eval/tasks/belebele/belebele_grn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fbb2b8aa31c2bbe999a7760e2c5c5e8170ba1e4a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_grn_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "grn_Latn" +"include": "_default_template_yaml" +"task": "belebele_grn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml b/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02e716d05c2913ace9b0b5125b85a0c8096d58e2 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml @@ -0,0 +1,3 @@ +"dataset_name": "guj_Gujr" +"include": "_default_template_yaml" +"task": "belebele_guj_Gujr" diff --git a/lm_eval/tasks/belebele/belebele_hat_Latn.yaml b/lm_eval/tasks/belebele/belebele_hat_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..691da434af0491f8f819e28576e43184c0c91b46 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hat_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "hat_Latn" +"include": 
"_default_template_yaml" +"task": "belebele_hat_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hau_Latn.yaml b/lm_eval/tasks/belebele/belebele_hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff94e767080440916104933aab1492f3749daa9d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hau_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "hau_Latn" +"include": "_default_template_yaml" +"task": "belebele_hau_Latn" diff --git a/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml b/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b46a240f5c8b6377b9e70251400d6312bdf2d6db --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml @@ -0,0 +1,3 @@ +"dataset_name": "heb_Hebr" +"include": "_default_template_yaml" +"task": "belebele_heb_Hebr" diff --git a/lm_eval/tasks/belebele/belebele_hin_Deva.yaml b/lm_eval/tasks/belebele/belebele_hin_Deva.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3e7c1aebca75bddc287569ee2a06794f8651fd4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hin_Deva.yaml @@ -0,0 +1,3 @@ +"dataset_name": "hin_Deva" +"include": "_default_template_yaml" +"task": "belebele_hin_Deva" diff --git a/lm_eval/tasks/belebele/belebele_hin_Latn.yaml b/lm_eval/tasks/belebele/belebele_hin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37085a323942fdf707dd0cde09b000a90368f657 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hin_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "hin_Latn" +"include": "_default_template_yaml" +"task": "belebele_hin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml b/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b501540fcce90f5f31adc776bbb802437b56479 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "hrv_Latn" +"include": "_default_template_yaml" +"task": "belebele_hrv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hun_Latn.yaml b/lm_eval/tasks/belebele/belebele_hun_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d211f591d41525bb83d15c486df51b736239d60 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hun_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "hun_Latn" +"include": "_default_template_yaml" +"task": "belebele_hun_Latn" diff --git a/lm_eval/tasks/belebele/belebele_hye_Armn.yaml b/lm_eval/tasks/belebele/belebele_hye_Armn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6752b2f789de1dd46a6116d94415e7016ccb36ee --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_hye_Armn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "hye_Armn" +"include": "_default_template_yaml" +"task": "belebele_hye_Armn" diff --git a/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml b/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17e48a753b77adda4a118603aeb0bb68e9214485 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ibo_Latn" +"include": "_default_template_yaml" +"task": "belebele_ibo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml b/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..457aa2e14bd4018cf7cd678f59e15fc59667e91e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ilo_Latn" +"include": 
"_default_template_yaml" +"task": "belebele_ilo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ind_Latn.yaml b/lm_eval/tasks/belebele/belebele_ind_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c90532f41fdbeaf4ff005534db771f75a71494f1 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ind_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ind_Latn" +"include": "_default_template_yaml" +"task": "belebele_ind_Latn" diff --git a/lm_eval/tasks/belebele/belebele_isl_Latn.yaml b/lm_eval/tasks/belebele/belebele_isl_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eece64e1dd7cb51aa7e3151acbb15f54c9713e4e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_isl_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "isl_Latn" +"include": "_default_template_yaml" +"task": "belebele_isl_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ita_Latn.yaml b/lm_eval/tasks/belebele/belebele_ita_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac958a6a5b73525de162881794c702f7ddc4e26d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ita_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ita_Latn" +"include": "_default_template_yaml" +"task": "belebele_ita_Latn" diff --git a/lm_eval/tasks/belebele/belebele_jav_Latn.yaml b/lm_eval/tasks/belebele/belebele_jav_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57435d1abc4be21173a07c9f2a7afa136f25b96a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_jav_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "jav_Latn" +"include": "_default_template_yaml" +"task": "belebele_jav_Latn" diff --git a/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml b/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6d0945191d5868a9103cd44e655bf73937792a0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml @@ -0,0 +1,3 @@ +"dataset_name": "jpn_Jpan" +"include": "_default_template_yaml" +"task": "belebele_jpn_Jpan" diff --git a/lm_eval/tasks/belebele/belebele_kac_Latn.yaml b/lm_eval/tasks/belebele/belebele_kac_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..090c135620e133ed9aa970811ff37ae6f7edcb9e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kac_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kac_Latn" +"include": "_default_template_yaml" +"task": "belebele_kac_Latn" diff --git a/lm_eval/tasks/belebele/belebele_kan_Knda.yaml b/lm_eval/tasks/belebele/belebele_kan_Knda.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0085fff81c30eafc6288224eaaa02d4ba8146dd4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kan_Knda.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kan_Knda" +"include": "_default_template_yaml" +"task": "belebele_kan_Knda" diff --git a/lm_eval/tasks/belebele/belebele_kat_Geor.yaml b/lm_eval/tasks/belebele/belebele_kat_Geor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b681bbabbe61eebbcfc3f23e3f78f2b79545968 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kat_Geor.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kat_Geor" +"include": "_default_template_yaml" +"task": "belebele_kat_Geor" diff --git a/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70c7c155d35d166acec0bd948c040b50421aa1ff --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kaz_Cyrl" +"include": 
"_default_template_yaml" +"task": "belebele_kaz_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_kea_Latn.yaml b/lm_eval/tasks/belebele/belebele_kea_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aae7056834011c19fc93bcb874e61799dfb28e79 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kea_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kea_Latn" +"include": "_default_template_yaml" +"task": "belebele_kea_Latn" diff --git a/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53d8839e5e0ab56d9547c690e7d6fb32d00b0af3 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "khk_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_khk_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml b/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef388cd2de7c932af589a681cfc73aeda00167cc --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml @@ -0,0 +1,3 @@ +"dataset_name": "khm_Khmr" +"include": "_default_template_yaml" +"task": "belebele_khm_Khmr" diff --git a/lm_eval/tasks/belebele/belebele_kin_Latn.yaml b/lm_eval/tasks/belebele/belebele_kin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edfeb80a64d714025952e63003ba49b1dee59c34 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kin_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kin_Latn" +"include": "_default_template_yaml" +"task": "belebele_kin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6cb7a4d9ce98f7980e9bb8fde3e0e0e1901f662 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kir_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_kir_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_kor_Hang.yaml b/lm_eval/tasks/belebele/belebele_kor_Hang.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ece7f55ec4c1ac8edba94503f6b6885cb0c529cd --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_kor_Hang.yaml @@ -0,0 +1,3 @@ +"dataset_name": "kor_Hang" +"include": "_default_template_yaml" +"task": "belebele_kor_Hang" diff --git a/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml b/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6012ac1c2fb0181c3b825b7cf0e3f5802f5483b1 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml @@ -0,0 +1,3 @@ +"dataset_name": "lao_Laoo" +"include": "_default_template_yaml" +"task": "belebele_lao_Laoo" diff --git a/lm_eval/tasks/belebele/belebele_lin_Latn.yaml b/lm_eval/tasks/belebele/belebele_lin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb33b8599db5e24a58385f50306c7ce3539993a5 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lin_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "lin_Latn" +"include": "_default_template_yaml" +"task": "belebele_lin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lit_Latn.yaml b/lm_eval/tasks/belebele/belebele_lit_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9943bd0301a307a48c252c4925f707c9eeea2df --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lit_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "lit_Latn" +"include": 
"_default_template_yaml" +"task": "belebele_lit_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lug_Latn.yaml b/lm_eval/tasks/belebele/belebele_lug_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19d4f056027f52a1ad78580b80e13c6f67f50060 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lug_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "lug_Latn" +"include": "_default_template_yaml" +"task": "belebele_lug_Latn" diff --git a/lm_eval/tasks/belebele/belebele_luo_Latn.yaml b/lm_eval/tasks/belebele/belebele_luo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73cc0aee12ea23a16fcc1d777c7336938d4009a0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_luo_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "luo_Latn" +"include": "_default_template_yaml" +"task": "belebele_luo_Latn" diff --git a/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml b/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18d291e56182a1c4cd08eaecc6fdb39939ecf842 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "lvs_Latn" +"include": "_default_template_yaml" +"task": "belebele_lvs_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml b/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml new file mode 100644 index 0000000000000000000000000000000000000000..283c67b2296ab4addd05b0565de79b984fa74e77 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml @@ -0,0 +1,3 @@ +"dataset_name": "mal_Mlym" +"include": "_default_template_yaml" +"task": "belebele_mal_Mlym" diff --git a/lm_eval/tasks/belebele/belebele_mar_Deva.yaml b/lm_eval/tasks/belebele/belebele_mar_Deva.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a103e88c4a4b803a772fab654f4b800bef78af0 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mar_Deva.yaml @@ -0,0 +1,3 @@ +"dataset_name": "mar_Deva" +"include": "_default_template_yaml" +"task": "belebele_mar_Deva" diff --git a/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3a696b406df92e14cc8120e4aca08523250c180 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "mkd_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_mkd_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml b/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2067469ab68bd0fc659c13b2d036795668e1ab66 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "mlt_Latn" +"include": "_default_template_yaml" +"task": "belebele_mlt_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mri_Latn.yaml b/lm_eval/tasks/belebele/belebele_mri_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cdfb5a31bab49668cab277e689c3e37c0292b0a --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mri_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "mri_Latn" +"include": "_default_template_yaml" +"task": "belebele_mri_Latn" diff --git a/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml b/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02a632b827e4362a336b4abba85c29d5c96275ae --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml @@ -0,0 +1,3 @@ +"dataset_name": "mya_Mymr" +"include": 
"_default_template_yaml" +"task": "belebele_mya_Mymr" diff --git a/lm_eval/tasks/belebele/belebele_nld_Latn.yaml b/lm_eval/tasks/belebele/belebele_nld_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e32ebd4d634b2af912d6f1c19381ce45d286dd53 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nld_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "nld_Latn" +"include": "_default_template_yaml" +"task": "belebele_nld_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nob_Latn.yaml b/lm_eval/tasks/belebele/belebele_nob_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29c690e555478c77a9866883bebd4d3b5cbbceca --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nob_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "nob_Latn" +"include": "_default_template_yaml" +"task": "belebele_nob_Latn" diff --git a/lm_eval/tasks/belebele/belebele_npi_Deva.yaml b/lm_eval/tasks/belebele/belebele_npi_Deva.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe2a82264bff787657c8ff3080e1066a06f80ae1 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_npi_Deva.yaml @@ -0,0 +1,3 @@ +"dataset_name": "npi_Deva" +"include": "_default_template_yaml" +"task": "belebele_npi_Deva" diff --git a/lm_eval/tasks/belebele/belebele_npi_Latn.yaml b/lm_eval/tasks/belebele/belebele_npi_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60e088096fea8ea25ac178d5c89a477924b334a8 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_npi_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "npi_Latn" +"include": "_default_template_yaml" +"task": "belebele_npi_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nso_Latn.yaml b/lm_eval/tasks/belebele/belebele_nso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7029428bc3bdc566dc61497927fed5458bb59179 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nso_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "nso_Latn" +"include": "_default_template_yaml" +"task": "belebele_nso_Latn" diff --git a/lm_eval/tasks/belebele/belebele_nya_Latn.yaml b/lm_eval/tasks/belebele/belebele_nya_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b648d75ec2e1865a1524bbcb3661655fc71703a6 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_nya_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "nya_Latn" +"include": "_default_template_yaml" +"task": "belebele_nya_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ory_Orya.yaml b/lm_eval/tasks/belebele/belebele_ory_Orya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a55ff01529d8ffe506c210b96e2794d3e39f37c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ory_Orya.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ory_Orya" +"include": "_default_template_yaml" +"task": "belebele_ory_Orya" diff --git a/lm_eval/tasks/belebele/belebele_pan_Guru.yaml b/lm_eval/tasks/belebele/belebele_pan_Guru.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b61bc026ddd66194fd49f1bd655c58c4d315f812 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pan_Guru.yaml @@ -0,0 +1,3 @@ +"dataset_name": "pan_Guru" +"include": "_default_template_yaml" +"task": "belebele_pan_Guru" diff --git a/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml b/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a0631e7bf313f8a67995addb98db55b9a2b0f5f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "pbt_Arab" +"include": 
"_default_template_yaml" +"task": "belebele_pbt_Arab" diff --git a/lm_eval/tasks/belebele/belebele_pes_Arab.yaml b/lm_eval/tasks/belebele/belebele_pes_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70b5608c622c5dfb5e72aabf538c8e6dd3e9f7bd --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pes_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "pes_Arab" +"include": "_default_template_yaml" +"task": "belebele_pes_Arab" diff --git a/lm_eval/tasks/belebele/belebele_plt_Latn.yaml b/lm_eval/tasks/belebele/belebele_plt_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ecd131441a6de86f00ddb5253107977e75e37096 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_plt_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "plt_Latn" +"include": "_default_template_yaml" +"task": "belebele_plt_Latn" diff --git a/lm_eval/tasks/belebele/belebele_pol_Latn.yaml b/lm_eval/tasks/belebele/belebele_pol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c3aedaed0a7f9b72577608cb01d28943fb2345e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_pol_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "pol_Latn" +"include": "_default_template_yaml" +"task": "belebele_pol_Latn" diff --git a/lm_eval/tasks/belebele/belebele_por_Latn.yaml b/lm_eval/tasks/belebele/belebele_por_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b4636a84f16f5fe9cbef35d4918fde57eba3057 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_por_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "por_Latn" +"include": "_default_template_yaml" +"task": "belebele_por_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ron_Latn.yaml b/lm_eval/tasks/belebele/belebele_ron_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5667ecbca0dea22621b7e41417a71468a04cd02c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ron_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ron_Latn" +"include": "_default_template_yaml" +"task": "belebele_ron_Latn" diff --git a/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17d3a351737f76e1727b0af8a42d65d81f3f01dc --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "rus_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_rus_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml b/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d19582f5e15019b119be2f3b7edc89444bf1af7b --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml @@ -0,0 +1,3 @@ +"dataset_name": "shn_Mymr" +"include": "_default_template_yaml" +"task": "belebele_shn_Mymr" diff --git a/lm_eval/tasks/belebele/belebele_sin_Latn.yaml b/lm_eval/tasks/belebele/belebele_sin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b631eac2eeba90b2fb140d08690c33cc462a134 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sin_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "sin_Latn" +"include": "_default_template_yaml" +"task": "belebele_sin_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml b/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b025120b8a40ff1942bb688224b27896ad2abb8f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml @@ -0,0 +1,3 @@ +"dataset_name": "sin_Sinh" +"include": 
"_default_template_yaml" +"task": "belebele_sin_Sinh" diff --git a/lm_eval/tasks/belebele/belebele_slk_Latn.yaml b/lm_eval/tasks/belebele/belebele_slk_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00a1c163e9f3011f110cf857e74f1e01bf57cd72 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_slk_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "slk_Latn" +"include": "_default_template_yaml" +"task": "belebele_slk_Latn" diff --git a/lm_eval/tasks/belebele/belebele_slv_Latn.yaml b/lm_eval/tasks/belebele/belebele_slv_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3567fb5d269afa0803211ca23d4dac96b72896e2 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_slv_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "slv_Latn" +"include": "_default_template_yaml" +"task": "belebele_slv_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sna_Latn.yaml b/lm_eval/tasks/belebele/belebele_sna_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9f01e836421c573e4c8fb7612ae2d2bb554f81f --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sna_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "sna_Latn" +"include": "_default_template_yaml" +"task": "belebele_sna_Latn" diff --git a/lm_eval/tasks/belebele/belebele_snd_Arab.yaml b/lm_eval/tasks/belebele/belebele_snd_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af16a289aad69092372e31806387e8b9b277bccd --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_snd_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "snd_Arab" +"include": "_default_template_yaml" +"task": "belebele_snd_Arab" diff --git a/lm_eval/tasks/belebele/belebele_som_Latn.yaml b/lm_eval/tasks/belebele/belebele_som_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06aa53c6d631f086c4bbacae9fdb6d9cd9b23937 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_som_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "som_Latn" +"include": "_default_template_yaml" +"task": "belebele_som_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sot_Latn.yaml b/lm_eval/tasks/belebele/belebele_sot_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb05d3cd79502e39e846231b6240187927980b1e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sot_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "sot_Latn" +"include": "_default_template_yaml" +"task": "belebele_sot_Latn" diff --git a/lm_eval/tasks/belebele/belebele_spa_Latn.yaml b/lm_eval/tasks/belebele/belebele_spa_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0ba62eaeaab0817629d16c0d2bd1647eded66da --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_spa_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "spa_Latn" +"include": "_default_template_yaml" +"task": "belebele_spa_Latn" diff --git a/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42a20f6dd99cbc77a8a14256222b85665fab19e2 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "srp_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_srp_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml b/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3655d7a1415b9e405a0562865162af043d0f7ac --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ssw_Latn" +"include": 
"_default_template_yaml" +"task": "belebele_ssw_Latn" diff --git a/lm_eval/tasks/belebele/belebele_sun_Latn.yaml b/lm_eval/tasks/belebele/belebele_sun_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..710b87b970ead18114614a1f5891baca56d85cfa --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_sun_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "sun_Latn" +"include": "_default_template_yaml" +"task": "belebele_sun_Latn" diff --git a/lm_eval/tasks/belebele/belebele_swe_Latn.yaml b/lm_eval/tasks/belebele/belebele_swe_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df1e896aff547c0fd6d1a2c8dd1faee7fca7b4ce --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_swe_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "swe_Latn" +"include": "_default_template_yaml" +"task": "belebele_swe_Latn" diff --git a/lm_eval/tasks/belebele/belebele_swh_Latn.yaml b/lm_eval/tasks/belebele/belebele_swh_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a006b91c1a3568a7a8bbf77083c011463f6ea0d --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_swh_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "swh_Latn" +"include": "_default_template_yaml" +"task": "belebele_swh_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tam_Taml.yaml b/lm_eval/tasks/belebele/belebele_tam_Taml.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0965cbd56c5c3ac81646fb48d2db94ef604bdec2 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tam_Taml.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tam_Taml" +"include": "_default_template_yaml" +"task": "belebele_tam_Taml" diff --git a/lm_eval/tasks/belebele/belebele_tel_Telu.yaml b/lm_eval/tasks/belebele/belebele_tel_Telu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ae5fad43bc18941c3afc04b995a48e317e2f13c --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tel_Telu.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tel_Telu" +"include": "_default_template_yaml" +"task": "belebele_tel_Telu" diff --git a/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd6bf5b6c4e129eca82047d28ad67b36256e0fa4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tgk_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_tgk_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml b/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6410484a68becae671834f63950fee3dc7991640 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tgl_Latn" +"include": "_default_template_yaml" +"task": "belebele_tgl_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tha_Thai.yaml b/lm_eval/tasks/belebele/belebele_tha_Thai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3786ccf33f6559d046d7ea3b207a3c23bed3b87 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tha_Thai.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tha_Thai" +"include": "_default_template_yaml" +"task": "belebele_tha_Thai" diff --git a/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml b/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..982ebb05c7af5ab5dfd005395d08d681b046da75 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tir_Ethi" +"include": 
"_default_template_yaml" +"task": "belebele_tir_Ethi" diff --git a/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml b/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..026c20f5e600ecd80b78f50965b952c9fd4ffe55 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tsn_Latn" +"include": "_default_template_yaml" +"task": "belebele_tsn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tso_Latn.yaml b/lm_eval/tasks/belebele/belebele_tso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91c75d9717e15662c563b68d503c0e91defbabe1 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tso_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tso_Latn" +"include": "_default_template_yaml" +"task": "belebele_tso_Latn" diff --git a/lm_eval/tasks/belebele/belebele_tur_Latn.yaml b/lm_eval/tasks/belebele/belebele_tur_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8f5946baf7f5a2305a59bc52016a31e0b1bc382 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_tur_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "tur_Latn" +"include": "_default_template_yaml" +"task": "belebele_tur_Latn" diff --git a/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml b/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f2474074efebd869e179f6e4fdfa742c56f9f10 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml @@ -0,0 +1,3 @@ +"dataset_name": "ukr_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_ukr_Cyrl" diff --git a/lm_eval/tasks/belebele/belebele_urd_Arab.yaml b/lm_eval/tasks/belebele/belebele_urd_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58a2016d144b740412f45ba7938b42356702709e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_urd_Arab.yaml @@ -0,0 +1,3 @@ +"dataset_name": "urd_Arab" +"include": "_default_template_yaml" +"task": "belebele_urd_Arab" diff --git a/lm_eval/tasks/belebele/belebele_urd_Latn.yaml b/lm_eval/tasks/belebele/belebele_urd_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a618465b21db2a1a9a00e97156a70c72dda9de66 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_urd_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "urd_Latn" +"include": "_default_template_yaml" +"task": "belebele_urd_Latn" diff --git a/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml b/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c8c0567fb87f9b73cf815ad4ed9df8d35c459ea --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "uzn_Latn" +"include": "_default_template_yaml" +"task": "belebele_uzn_Latn" diff --git a/lm_eval/tasks/belebele/belebele_vie_Latn.yaml b/lm_eval/tasks/belebele/belebele_vie_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c676ad90313a34bb1caba9693375156caebd7a2 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_vie_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "vie_Latn" +"include": "_default_template_yaml" +"task": "belebele_vie_Latn" diff --git a/lm_eval/tasks/belebele/belebele_war_Latn.yaml b/lm_eval/tasks/belebele/belebele_war_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b133e2469d597ae46fe9a5633cd070c810ae06e --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_war_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "war_Latn" +"include": 
"_default_template_yaml" +"task": "belebele_war_Latn" diff --git a/lm_eval/tasks/belebele/belebele_wol_Latn.yaml b/lm_eval/tasks/belebele/belebele_wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67b0530c3306d6b926f3d8de7134b87b9bded2c4 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_wol_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "wol_Latn" +"include": "_default_template_yaml" +"task": "belebele_wol_Latn" diff --git a/lm_eval/tasks/belebele/belebele_xho_Latn.yaml b/lm_eval/tasks/belebele/belebele_xho_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7665f9b52cfa3517cdfa6eaad199c4b3ea4c6adf --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_xho_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "xho_Latn" +"include": "_default_template_yaml" +"task": "belebele_xho_Latn" diff --git a/lm_eval/tasks/belebele/belebele_yor_Latn.yaml b/lm_eval/tasks/belebele/belebele_yor_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e293145ff628549829dcb60dbf82988960ee65e8 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_yor_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "yor_Latn" +"include": "_default_template_yaml" +"task": "belebele_yor_Latn" diff --git a/lm_eval/tasks/belebele/belebele_zho_Hans.yaml b/lm_eval/tasks/belebele/belebele_zho_Hans.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d5d175a75270da7f0fa99cd8edb99bfd5ec42d2 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_zho_Hans.yaml @@ -0,0 +1,3 @@ +"dataset_name": "zho_Hans" +"include": "_default_template_yaml" +"task": "belebele_zho_Hans" diff --git a/lm_eval/tasks/belebele/belebele_zho_Hant.yaml b/lm_eval/tasks/belebele/belebele_zho_Hant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54fedc5d7e3f0fe65dcc3ddc077bf4cfc11ccd94 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_zho_Hant.yaml @@ -0,0 +1,3 @@ +"dataset_name": "zho_Hant" +"include": "_default_template_yaml" +"task": "belebele_zho_Hant" diff --git a/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml b/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..616bcc0dfbda9a67493076521d72651434c244c3 --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "zsm_Latn" +"include": "_default_template_yaml" +"task": "belebele_zsm_Latn" diff --git a/lm_eval/tasks/belebele/belebele_zul_Latn.yaml b/lm_eval/tasks/belebele/belebele_zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9da6f4d011df819b5aad0f84bcf4d336cd9313b --- /dev/null +++ b/lm_eval/tasks/belebele/belebele_zul_Latn.yaml @@ -0,0 +1,3 @@ +"dataset_name": "zul_Latn" +"include": "_default_template_yaml" +"task": "belebele_zul_Latn" diff --git a/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml b/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml index cbd4084949c74cd28e66a5cb1b336b1473add5a7..6e460a0e6779c754da4d6e10e1a2e3dc7e845da9 100644 --- a/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml +++ b/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml @@ -1,5 +1,5 @@ group: flan-cot -output_type: greedy_until +output_type: generate_until validation_split: validation doc_to_target: "{{answer}}" metric_list: diff --git a/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml b/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml index 
e09daca2c46200a18a66498c46c9f29aae3348e1..f5050a49b68c34b36df5f3b691caad2e7e377d01 100644 --- a/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml +++ b/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml @@ -1,4 +1,4 @@ -output_type: greedy_until +output_type: generate_until validation_split: validation metric_list: - metric: exact_match diff --git a/lm_eval/benchmarks/minerva_math.yaml b/lm_eval/tasks/benchmarks/minerva_math.yaml similarity index 100% rename from lm_eval/benchmarks/minerva_math.yaml rename to lm_eval/tasks/benchmarks/minerva_math.yaml diff --git a/lm_eval/tasks/benchmarks/pythia.yaml b/lm_eval/tasks/benchmarks/pythia.yaml index bb5b1174badf0bb36a242699ff1bf560a66c67ce..bdeadd3ce995ce3d4d9340082ede3bf424ba276d 100644 --- a/lm_eval/tasks/benchmarks/pythia.yaml +++ b/lm_eval/tasks/benchmarks/pythia.yaml @@ -9,4 +9,4 @@ task: - wsc - ai2_arc - blimp - - hendrycksTest* + - mmlu diff --git a/lm_eval/tasks/benchmarks/t0_eval.yaml b/lm_eval/tasks/benchmarks/t0_eval.yaml index 788122e9fc7392e6517f8879cf0742a3cc996fb8..27e7adc41bd2eaffa20b3344cfdf83a52b4d65fc 100644 --- a/lm_eval/tasks/benchmarks/t0_eval.yaml +++ b/lm_eval/tasks/benchmarks/t0_eval.yaml @@ -6,7 +6,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -19,7 +19,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -32,7 +32,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -44,7 +44,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -56,7 +56,7 @@ task: use_prompt: promptsource:* training_split: train_r1 validation_split: dev_r1 - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -68,7 +68,7 @@ task: use_prompt: promptsource:* training_split: train_r2 validation_split: dev_r2 - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -80,7 +80,7 @@ task: use_prompt: promptsource:* training_split: train_r3 validation_split: dev_r3 - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -93,7 +93,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -105,7 +105,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean @@ -118,7 +118,7 @@ task: use_prompt: promptsource:* training_split: train validation_split: validation - output_type: greedy_until + output_type: generate_until metric_list: - metric: exact_match aggregation: mean diff --git a/lm_eval/tasks/bigbench/aux_metric.py b/lm_eval/tasks/bigbench/aux_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..bd87fbec6e5fd9c3048323d738f0c0019bd90cc8 --- /dev/null 
+++ b/lm_eval/tasks/bigbench/aux_metric.py @@ -0,0 +1,13 @@ +from textdistance import levenshtein +from transformers import AutoTokenizer + +# Change this tokenizer to match the model you are evaluating. +tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b") + + +def token_edit_distance(references, predictions, **kwargs): + # Tokenize the first reference and prediction, then score their Levenshtein + # (edit) distance over token IDs rather than raw characters. + ref_tokens = tokenizer.encode(references[0]) + pred_tokens = tokenizer.encode(predictions[0]) + return levenshtein.distance(ref_tokens, pred_tokens) diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py index 00a8799eaa14f81021bcd21c96532c724cb15d9d..fa8619f40c7b2e357547b74e499e1e8dfdfe7689 100644 --- a/lm_eval/tasks/bigbench/generate_tasks.py +++ b/lm_eval/tasks/bigbench/generate_tasks.py @@ -175,8 +175,8 @@ all_subtasks = [ def main() -> None: for path, task_type in zip( - ["multiple_choice", "greedy_until"], - ["multiple_choice_template_yaml", "greedy_until_template_yaml"], + ["multiple_choice", "generate_until"], + ["multiple_choice_template_yaml", "generate_until_template_yaml"], ): os.makedirs(path, exist_ok=True) for task in all_subtasks: diff --git a/lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dce5238b65beb5e1eb7d579f72abac0e91079984 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: abstract_narrative_understanding_zero_shot +include: ../generate_until_template_yaml +task: bigbench_abstract_narrative_understanding_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/anachronisms.yaml b/lm_eval/tasks/bigbench/generate_until/anachronisms.yaml new file mode 100644 index 0000000000000000000000000000000000000000..831361984ab186fb29835595db2853469ee0f7e6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/anachronisms.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: anachronisms_zero_shot +include: ../generate_until_template_yaml +task: bigbench_anachronisms_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml b/lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5cc6550a6075a991bce4826c95188e0c7b3d2a94 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: analogical_similarity_zero_shot +include: ../generate_until_template_yaml +task: bigbench_analogical_similarity_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml b/lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ae5cfe90f02a8154c49c23ff2aad2cbb40cbbc1 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: analytic_entailment_zero_shot +include: ../generate_until_template_yaml +task: bigbench_analytic_entailment_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/arithmetic.yaml b/lm_eval/tasks/bigbench/generate_until/arithmetic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6ae791f5f3b7057f4d7927a986ec57bc27cb7cb --- /dev/null
+++ b/lm_eval/tasks/bigbench/generate_until/arithmetic.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: arithmetic_zero_shot +include: ../generate_until_template_yaml +task: bigbench_arithmetic_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60eaa0be986950cc508431170accc8a9ae644c36 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ascii_word_recognition_zero_shot +include: ../generate_until_template_yaml +task: bigbench_ascii_word_recognition_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml b/lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d7510dfc80d4e52db0cc020f5f2abcdf9952795 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: authorship_verification_zero_shot +include: ../generate_until_template_yaml +task: bigbench_authorship_verification_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml b/lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d90a0e7cc31f1c7a04f7b509a26513d6bdb22c00 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: auto_categorization_zero_shot +include: ../generate_until_template_yaml +task: bigbench_auto_categorization_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml b/lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8802c1c85d3dd4ae02f04a86982b08be6e214e3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: auto_debugging_zero_shot +include: ../generate_until_template_yaml +task: bigbench_auto_debugging_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6812f69961b8a0a57d86d98e40c5316484fb5623 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: bbq_lite_json_zero_shot +include: ../generate_until_template_yaml +task: bigbench_bbq_lite_json_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28e7309f9f0e3ef74e662bdf0cd372c165400ee9 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: bridging_anaphora_resolution_barqa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_bridging_anaphora_resolution_barqa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml b/lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..1e1656800ad5d19d72508aaa35e68af0b55da624 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: causal_judgment_zero_shot +include: ../generate_until_template_yaml +task: bigbench_causal_judgment_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml b/lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c34bfdc26ecc1dc3f2f8e023e13eefc85d3fad71 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cause_and_effect_zero_shot +include: ../generate_until_template_yaml +task: bigbench_cause_and_effect_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0736f96ba0ca4bb0cd042ef325132b81a06f3d5 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: checkmate_in_one_zero_shot +include: ../generate_until_template_yaml +task: bigbench_checkmate_in_one_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b3dde85706c6b50ca3c597443efb6686037fe8b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: chess_state_tracking_zero_shot +include: ../generate_until_template_yaml +task: bigbench_chess_state_tracking_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..872e809b8637380fd3eafa0bb4a5a57e7ce6335c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: chinese_remainder_theorem_zero_shot +include: ../generate_until_template_yaml +task: bigbench_chinese_remainder_theorem_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml b/lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a3b08ca6c4db099c156f4cc2277e408c8cee6a4 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cifar10_classification_zero_shot +include: ../generate_until_template_yaml +task: bigbench_cifar10_classification_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/code_line_description.yaml b/lm_eval/tasks/bigbench/generate_until/code_line_description.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bd83353a5fcebc5abcded346ab4d38f26bbd7ee --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/code_line_description.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: code_line_description_zero_shot +include: ../generate_until_template_yaml +task: bigbench_code_line_description_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/codenames.yaml 
b/lm_eval/tasks/bigbench/generate_until/codenames.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e71510b4ba4215c91aca96d4a2c2d7fb676498e6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/codenames.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: codenames_zero_shot +include: ../generate_until_template_yaml +task: bigbench_codenames_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/color.yaml b/lm_eval/tasks/bigbench/generate_until/color.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18793a9977a0d84bf32470e1f5ba0493549e31fd --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/color.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: color_zero_shot +include: ../generate_until_template_yaml +task: bigbench_color_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml b/lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09a8b9f407385400214d48478a6e2cf9b24a70cc --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: common_morpheme_zero_shot +include: ../generate_until_template_yaml +task: bigbench_common_morpheme_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b36c1d5c2a2ac9a6d6a0b633c2777135122610b0 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: conceptual_combinations_zero_shot +include: ../generate_until_template_yaml +task: bigbench_conceptual_combinations_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml b/lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec9cccc8c72e887e047a5871c496d68498f7f576 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: conlang_translation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_conlang_translation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e4da8946fd98ef021df67902ba5dc4857f34a227 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: contextual_parametric_knowledge_conflicts_zero_shot +include: ../generate_until_template_yaml +task: bigbench_contextual_parametric_knowledge_conflicts_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml b/lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b551e5d8aa4e8963fbcb6f6476c76c0db64b609 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: crash_blossom_zero_shot +include: ../generate_until_template_yaml +task: bigbench_crash_blossom_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/crass_ai.yaml 
b/lm_eval/tasks/bigbench/generate_until/crass_ai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a65d1c334295ee8f3370305a7f563dd21c476680 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/crass_ai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: crass_ai_zero_shot +include: ../generate_until_template_yaml +task: bigbench_crass_ai_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5fc59ee24bb455dff7cb77cfdb73ad11b7f1f572 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cryobiology_spanish_zero_shot +include: ../generate_until_template_yaml +task: bigbench_cryobiology_spanish_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/cryptonite.yaml b/lm_eval/tasks/bigbench/generate_until/cryptonite.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3393c36805d6b29cd3d59481b11c8b8dd45e2910 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/cryptonite.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cryptonite_zero_shot +include: ../generate_until_template_yaml +task: bigbench_cryptonite_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml b/lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml new file mode 100644 index 0000000000000000000000000000000000000000..938fc4aff312eabeda39e95f46eaa787f9526ef2 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: cs_algorithms_zero_shot +include: ../generate_until_template_yaml +task: bigbench_cs_algorithms_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f13ec2a4a0fc2dd244aefb53cb7e409fdb2bdad1 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: dark_humor_detection_zero_shot +include: ../generate_until_template_yaml +task: bigbench_dark_humor_detection_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/date_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/date_understanding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0fdca6abd643776f45e4bd7163fd0fbe01f6087f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/date_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: date_understanding_zero_shot +include: ../generate_until_template_yaml +task: bigbench_date_understanding_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b671d715e1fe69c06c20385bc07b493ecc4d4d6f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: disambiguation_qa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_disambiguation_qa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..30182d9d1f884411dff255d208fd5c999209b003 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: discourse_marker_prediction_zero_shot +include: ../generate_until_template_yaml +task: bigbench_discourse_marker_prediction_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml b/lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c6b9567bef7165ab725f1286ea33b2c62c0fc48 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: disfl_qa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_disfl_qa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml b/lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml new file mode 100644 index 0000000000000000000000000000000000000000..814a95de6b16fb6ceb57cb9991bdec00bdffabb7 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: dyck_languages_zero_shot +include: ../generate_until_template_yaml +task: bigbench_dyck_languages_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fe807bc645a88d7f2e87da1d094a2ec1bb51805 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: elementary_math_qa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_elementary_math_qa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml b/lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af958389cb784df75e9a82573087903642cef6ab --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: emoji_movie_zero_shot +include: ../generate_until_template_yaml +task: bigbench_emoji_movie_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml b/lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3eafb81943aec74feb620500ba8281f62249873b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: emojis_emotion_prediction_zero_shot +include: ../generate_until_template_yaml +task: bigbench_emojis_emotion_prediction_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml b/lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b26cbee762ba972b44d9404f421e975ee285487 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: empirical_judgments_zero_shot +include: ../generate_until_template_yaml +task: bigbench_empirical_judgments_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..cdd014d9c64b37666cc54c9b7097941fcb2a54a2 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: english_proverbs_zero_shot +include: ../generate_until_template_yaml +task: bigbench_english_proverbs_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e6da1e0ce03973656fdceb8854cf2b6adbeeedf --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: english_russian_proverbs_zero_shot +include: ../generate_until_template_yaml +task: bigbench_english_russian_proverbs_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml b/lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb2ecba07ebf5bd97f7482e1adb535e064f8a146 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: entailed_polarity_zero_shot +include: ../generate_until_template_yaml +task: bigbench_entailed_polarity_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aba850d30fb5bc2e120aabd616663cbcd04f8488 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: entailed_polarity_hindi_zero_shot +include: ../generate_until_template_yaml +task: bigbench_entailed_polarity_hindi_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f080bcf3988c2dcbcee08bae53025f6ce18ece13 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: epistemic_reasoning_zero_shot +include: ../generate_until_template_yaml +task: bigbench_epistemic_reasoning_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b640b9430ad8a11758152c63ad0c77497fd16d50 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: evaluating_information_essentiality_zero_shot +include: ../generate_until_template_yaml +task: bigbench_evaluating_information_essentiality_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/fact_checker.yaml b/lm_eval/tasks/bigbench/generate_until/fact_checker.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62dd5197439239a86c7d044d28fd936226481a02 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/fact_checker.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fact_checker_zero_shot +include: ../generate_until_template_yaml +task: bigbench_fact_checker_generate_until diff --git 
a/lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml b/lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b94f4c05b924d9ca001addc50ba76a03fc3a32f7 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fantasy_reasoning_zero_shot +include: ../generate_until_template_yaml +task: bigbench_fantasy_reasoning_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..718837f1c086b955d97d5ab0661dc350d482ae20 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: few_shot_nlg_zero_shot +include: ../generate_until_template_yaml +task: bigbench_few_shot_nlg_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffbb5f60f4fac85de3847fb7c26c0b5b98403409 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: figure_of_speech_detection_zero_shot +include: ../generate_until_template_yaml +task: bigbench_figure_of_speech_detection_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3afc0edf2efd7056f8d46ad0d85ae55c7073be8 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: formal_fallacies_syllogisms_negation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_formal_fallacies_syllogisms_negation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/gem.yaml b/lm_eval/tasks/bigbench/generate_until/gem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f59f287869076ebf202cbf4f01d52b2935f87820 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/gem.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: gem_zero_shot +include: ../generate_until_template_yaml +task: bigbench_gem_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12dd01b8b299a1fd703c8853653eea979543b0a5 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: gender_inclusive_sentences_german_zero_shot +include: ../generate_until_template_yaml +task: bigbench_gender_inclusive_sentences_german_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml b/lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c0a2ea65470661e5e8822ac7b46e89d01bdebca --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: general_knowledge_zero_shot +include: 
../generate_until_template_yaml +task: bigbench_general_knowledge_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml b/lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d586c3cb372b95a43243c59e6e7abc04f61f6513 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: geometric_shapes_zero_shot +include: ../generate_until_template_yaml +task: bigbench_geometric_shapes_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22748246128e774650563a8652a94d57b0e5a338 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: goal_step_wikihow_zero_shot +include: ../generate_until_template_yaml +task: bigbench_goal_step_wikihow_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml new file mode 100644 index 0000000000000000000000000000000000000000..449b09c47ed4638e2773772b0ce27264cd694be0 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: gre_reading_comprehension_zero_shot +include: ../generate_until_template_yaml +task: bigbench_gre_reading_comprehension_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml b/lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5c437a4ad0322775013c80ff48cd1d875eb2cff --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hhh_alignment_zero_shot +include: ../generate_until_template_yaml +task: bigbench_hhh_alignment_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..463450b0cb275e2ea6391eb5bed44782ad3265da --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hindi_question_answering_zero_shot +include: ../generate_until_template_yaml +task: bigbench_hindi_question_answering_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fef48a443c5256290c90650834832ebf2008000 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hindu_knowledge_zero_shot +include: ../generate_until_template_yaml +task: bigbench_hindu_knowledge_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ad63dda3e7cd433a29e34282ceaec71f188fa76 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hinglish_toxicity_zero_shot 
+include: ../generate_until_template_yaml +task: bigbench_hinglish_toxicity_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml b/lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2334fd6dc7d0a02751be1672d5f21eed837cb07b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: human_organs_senses_zero_shot +include: ../generate_until_template_yaml +task: bigbench_human_organs_senses_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml b/lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e428c2a5304d43efc1b00ff53e8d3de493c115b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hyperbaton_zero_shot +include: ../generate_until_template_yaml +task: bigbench_hyperbaton_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d0028e03dcb1af695e98fdb619c7d6d101e290c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: identify_math_theorems_zero_shot +include: ../generate_until_template_yaml +task: bigbench_identify_math_theorems_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4e1f9aa86cd9e29ad5109673b767dc33bde1e00 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: identify_odd_metaphor_zero_shot +include: ../generate_until_template_yaml +task: bigbench_identify_odd_metaphor_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/implicatures.yaml b/lm_eval/tasks/bigbench/generate_until/implicatures.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf19c32aad8960cc8427d7269927fd67ae732f14 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/implicatures.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: implicatures_zero_shot +include: ../generate_until_template_yaml +task: bigbench_implicatures_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml b/lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..361f0435ef63a75bd2413100c434841f206ee5f8 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: implicit_relations_zero_shot +include: ../generate_until_template_yaml +task: bigbench_implicit_relations_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml b/lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0583a17e4b456ca0d6334353fc16d8e89e95b962 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: intent_recognition_zero_shot +include: 
../generate_until_template_yaml +task: bigbench_intent_recognition_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1497c7802888d83da4c99cb1c0845e15da887584 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: international_phonetic_alphabet_nli_zero_shot +include: ../generate_until_template_yaml +task: bigbench_international_phonetic_alphabet_nli_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71ad3b9d4a7f980529e64ce4ebba38a4db026f05 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: international_phonetic_alphabet_transliterate_zero_shot +include: ../generate_until_template_yaml +task: bigbench_international_phonetic_alphabet_transliterate_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml b/lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f2868a4a7c7345f4fe40047e1ecb4e06a53e3ee --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: intersect_geometry_zero_shot +include: ../generate_until_template_yaml +task: bigbench_intersect_geometry_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/irony_identification.yaml b/lm_eval/tasks/bigbench/generate_until/irony_identification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..556c5a62a7e31b56732dd158efca9111fa2b8f60 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/irony_identification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: irony_identification_zero_shot +include: ../generate_until_template_yaml +task: bigbench_irony_identification_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml b/lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9a8a5b86f69a9966116c203a114d2d0ca5428e7 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kanji_ascii_zero_shot +include: ../generate_until_template_yaml +task: bigbench_kanji_ascii_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/kannada.yaml b/lm_eval/tasks/bigbench/generate_until/kannada.yaml new file mode 100644 index 0000000000000000000000000000000000000000..047e7049b4a5adb0f4a16d31f0018ece6be0e72e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/kannada.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kannada_zero_shot +include: ../generate_until_template_yaml +task: bigbench_kannada_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml b/lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ea697d1f7664866050ecbd0615ea3e957a13602 --- /dev/null +++ 
b/lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: key_value_maps_zero_shot +include: ../generate_until_template_yaml +task: bigbench_key_value_maps_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml b/lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1a8bb0640198dd3a1e288e80804a9dadeb2c806 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: known_unknowns_zero_shot +include: ../generate_until_template_yaml +task: bigbench_known_unknowns_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/language_games.yaml b/lm_eval/tasks/bigbench/generate_until/language_games.yaml new file mode 100644 index 0000000000000000000000000000000000000000..560223007d7670499ec5064dddf200c0a252fc89 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/language_games.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: language_games_zero_shot +include: ../generate_until_template_yaml +task: bigbench_language_games_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/language_identification.yaml b/lm_eval/tasks/bigbench/generate_until/language_identification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cb7b27408b9a82c308ebac33b89e799df0763a0 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/language_identification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: language_identification_zero_shot +include: ../generate_until_template_yaml +task: bigbench_language_identification_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc351ce11290861bdf9d9ce71fb46ee832282265 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: linguistic_mappings_zero_shot +include: ../generate_until_template_yaml +task: bigbench_linguistic_mappings_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df8b729a6bad1ee9c30bd57f659f9f61d0e840e4 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: linguistics_puzzles_zero_shot +include: ../generate_until_template_yaml +task: bigbench_linguistics_puzzles_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/list_functions.yaml b/lm_eval/tasks/bigbench/generate_until/list_functions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..658630ac7a0ba0e0dfbc7c86e08a518866e6746c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/list_functions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: list_functions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_list_functions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa8f2c2fefbed31d42e61db5261810b49e7ff35e --- /dev/null +++ 
b/lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logic_grid_puzzle_zero_shot +include: ../generate_until_template_yaml +task: bigbench_logic_grid_puzzle_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/logical_args.yaml b/lm_eval/tasks/bigbench/generate_until/logical_args.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e85c142962ef552e5727de69763c01c912ac5716 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/logical_args.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_args_zero_shot +include: ../generate_until_template_yaml +task: bigbench_logical_args_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml b/lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8fdaac7ffbe019507c5c0ed588df162538aaadc6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_deduction_zero_shot +include: ../generate_until_template_yaml +task: bigbench_logical_deduction_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a74d11ea422980037b47c95d8f7aad02f7a76f5e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_fallacy_detection_zero_shot +include: ../generate_until_template_yaml +task: bigbench_logical_fallacy_detection_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml b/lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b55c057b1e67b4d5af232a9f9710dbbd56f10899 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: logical_sequence_zero_shot +include: ../generate_until_template_yaml +task: bigbench_logical_sequence_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml b/lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml new file mode 100644 index 0000000000000000000000000000000000000000..59e4fc3f2bb68f45cd4ff9c158b82b4cd0032241 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: mathematical_induction_zero_shot +include: ../generate_until_template_yaml +task: bigbench_mathematical_induction_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml b/lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a162eae1b4226ba93f7dce1f0d8c46800512f9e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: matrixshapes_zero_shot +include: ../generate_until_template_yaml +task: bigbench_matrixshapes_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28922b3f1b498e073db5835c94bf3ee03fa07ebd --- /dev/null +++ 
b/lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: metaphor_boolean_zero_shot +include: ../generate_until_template_yaml +task: bigbench_metaphor_boolean_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..029a4c0a073ccaefc8975ae37937319b27f1e7ee --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: metaphor_understanding_zero_shot +include: ../generate_until_template_yaml +task: bigbench_metaphor_understanding_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d453fd941b840482073260cb55a095d4534baeeb --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: minute_mysteries_qa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_minute_mysteries_qa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/misconceptions.yaml b/lm_eval/tasks/bigbench/generate_until/misconceptions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3375eb60927e49931f96289b8ddb6b0f2a3d002 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/misconceptions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: misconceptions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_misconceptions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5e5e102ae68e5c472cfb368652064f4f67259fe --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: misconceptions_russian_zero_shot +include: ../generate_until_template_yaml +task: bigbench_misconceptions_russian_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml b/lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db7ce738e76e2de4b5af98a034e517f48ed493e7 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: mnist_ascii_zero_shot +include: ../generate_until_template_yaml +task: bigbench_mnist_ascii_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edbb2b34b8cceb119a191942fb617cf99367cd40 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: modified_arithmetic_zero_shot +include: ../generate_until_template_yaml +task: bigbench_modified_arithmetic_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml b/lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml new file mode 100644 index 0000000000000000000000000000000000000000..277bf69feff29559672655e47ce037df3c42c454 --- /dev/null +++ 
b/lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: moral_permissibility_zero_shot +include: ../generate_until_template_yaml +task: bigbench_moral_permissibility_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml new file mode 100644 index 0000000000000000000000000000000000000000..27cc6228f092b33652b0adcc5597fe71365128b0 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: movie_dialog_same_or_different_zero_shot +include: ../generate_until_template_yaml +task: bigbench_movie_dialog_same_or_different_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml b/lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97c370ce883eaab2f9ea3abad34f08b2d1838b22 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: movie_recommendation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_movie_recommendation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml new file mode 100644 index 0000000000000000000000000000000000000000..622c7ab13312abd8aa3d1ad7d932ce06b13b4ba5 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: mult_data_wrangling_zero_shot +include: ../generate_until_template_yaml +task: bigbench_mult_data_wrangling_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/multiemo.yaml b/lm_eval/tasks/bigbench/generate_until/multiemo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..465ccd0ce4f15270edcc4a4e2585764ee59d4e71 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/multiemo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: multiemo_zero_shot +include: ../generate_until_template_yaml +task: bigbench_multiemo_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml b/lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b77c895577fa3894b4f6646702c7e237436864b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: natural_instructions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_natural_instructions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/navigate.yaml b/lm_eval/tasks/bigbench/generate_until/navigate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..549ed37058fb3c2a9db7eb9d0d6e6ba4c2868983 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/navigate.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: navigate_zero_shot +include: ../generate_until_template_yaml +task: bigbench_navigate_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ed30902f6ec63439564b5e021807eb4ae672967 --- /dev/null +++ 
b/lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: nonsense_words_grammar_zero_shot +include: ../generate_until_template_yaml +task: bigbench_nonsense_words_grammar_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml b/lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12f388f8ef8164c30c0843d0a0cda59bc108d66d --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: novel_concepts_zero_shot +include: ../generate_until_template_yaml +task: bigbench_novel_concepts_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/object_counting.yaml b/lm_eval/tasks/bigbench/generate_until/object_counting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9fc956996d41418c40d23c255ba2abfd0a831b1 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/object_counting.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: object_counting_zero_shot +include: ../generate_until_template_yaml +task: bigbench_object_counting_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml b/lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a58d7b5fb25068bb2149f4112355106f91fe263a --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: odd_one_out_zero_shot +include: ../generate_until_template_yaml +task: bigbench_odd_one_out_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/operators.yaml b/lm_eval/tasks/bigbench/generate_until/operators.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6aaa8b61799f665645249c19d833593576709c6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/operators.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: operators_zero_shot +include: ../generate_until_template_yaml +task: bigbench_operators_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f982c5db5ccb458e9815708a26493f309ea436a --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: paragraph_segmentation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_paragraph_segmentation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..552f8c6068fde183ab744a1e322c41c8744070e0 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: parsinlu_qa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_parsinlu_qa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml new file mode 100644 index 0000000000000000000000000000000000000000..358184e11ced80305697c7e5f18317af2161bab9 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml @@ -0,0 +1,4 @@ 
+# Generated by utils.py +dataset_name: parsinlu_reading_comprehension_zero_shot +include: ../generate_until_template_yaml +task: bigbench_parsinlu_reading_comprehension_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6dc70030d9ef8ea5671a780bd88a186122a0fb47 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: penguins_in_a_table_zero_shot +include: ../generate_until_template_yaml +task: bigbench_penguins_in_a_table_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml b/lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5c96cec606f6ba3e749c970b20f71d9ed200799 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: periodic_elements_zero_shot +include: ../generate_until_template_yaml +task: bigbench_periodic_elements_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml b/lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e3aa0f47f46229e09b8d9bee0805eb4bbf5b671 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: persian_idioms_zero_shot +include: ../generate_until_template_yaml +task: bigbench_persian_idioms_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml new file mode 100644 index 0000000000000000000000000000000000000000..037da053e4e5ed3869f75976ebca9ae81d394314 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: phrase_relatedness_zero_shot +include: ../generate_until_template_yaml +task: bigbench_phrase_relatedness_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml b/lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ecef1581c907281e920a08651434a15313f0dc39 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: physical_intuition_zero_shot +include: ../generate_until_template_yaml +task: bigbench_physical_intuition_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/physics.yaml b/lm_eval/tasks/bigbench/generate_until/physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39bc786bae05862d66b4f358313feee70ee8d14a --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/physics.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: physics_zero_shot +include: ../generate_until_template_yaml +task: bigbench_physics_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/physics_questions.yaml b/lm_eval/tasks/bigbench/generate_until/physics_questions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3fcfd47776ba5be480ed396fb98534e3cc7316aa --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/physics_questions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: 
physics_questions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_physics_questions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57b65cfd3b09ccbf473a6788f28777d05b71112b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: play_dialog_same_or_different_zero_shot +include: ../generate_until_template_yaml +task: bigbench_play_dialog_same_or_different_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23775493c1ffe4e82c833515eef998f767b401db --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: polish_sequence_labeling_zero_shot +include: ../generate_until_template_yaml +task: bigbench_polish_sequence_labeling_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70da2d747022062c552856c3594c5033b1401562 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: presuppositions_as_nli_zero_shot +include: ../generate_until_template_yaml +task: bigbench_presuppositions_as_nli_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml b/lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fb5b23036510e8256774fb0d32964a590ff9dfe --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: qa_wikidata_zero_shot +include: ../generate_until_template_yaml +task: bigbench_qa_wikidata_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/question_selection.yaml b/lm_eval/tasks/bigbench/generate_until/question_selection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e2321a8db770ea9e20761f5b7b117cbdeb7b583 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/question_selection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: question_selection_zero_shot +include: ../generate_until_template_yaml +task: bigbench_question_selection_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml new file mode 100644 index 0000000000000000000000000000000000000000..948bfb0c478b96a8e1285819748f905acfc004b1 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: real_or_fake_text_zero_shot +include: ../generate_until_template_yaml +task: bigbench_real_or_fake_text_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b371d6e37baabaadb7a7e7424a12cd9dd7b81b9 --- /dev/null +++ 
b/lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: reasoning_about_colored_objects_zero_shot +include: ../generate_until_template_yaml +task: bigbench_reasoning_about_colored_objects_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd8cd4d8563d4be2b92e18fcd48adc13d6c06f9e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: repeat_copy_logic_zero_shot +include: ../generate_until_template_yaml +task: bigbench_repeat_copy_logic_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/rephrase.yaml b/lm_eval/tasks/bigbench/generate_until/rephrase.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16a337dbc2a8568cc36245f34b7eccaf28ed2548 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/rephrase.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: rephrase_zero_shot +include: ../generate_until_template_yaml +task: bigbench_rephrase_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml b/lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml new file mode 100644 index 0000000000000000000000000000000000000000..745cdb3244845caa9914fae7073b29f64f9773bb --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: riddle_sense_zero_shot +include: ../generate_until_template_yaml +task: bigbench_riddle_sense_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/ruin_names.yaml b/lm_eval/tasks/bigbench/generate_until/ruin_names.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9ceddad3293c7c5fc315302962a63f61274b322 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/ruin_names.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ruin_names_zero_shot +include: ../generate_until_template_yaml +task: bigbench_ruin_names_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4968e441daa4b119bcaf0e5ae5f33d2acfd5a4a6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: salient_translation_error_detection_zero_shot +include: ../generate_until_template_yaml +task: bigbench_salient_translation_error_detection_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml b/lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml new file mode 100644 index 0000000000000000000000000000000000000000..122f66e7da0ec45e780fbb727809452c6ef64036 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: scientific_press_release_zero_shot +include: ../generate_until_template_yaml +task: bigbench_scientific_press_release_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..276c997a1a6ea5d582cc89fe3ac858389aa287c3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: semantic_parsing_in_context_sparc_zero_shot +include: ../generate_until_template_yaml +task: bigbench_semantic_parsing_in_context_sparc_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39307d92fc3d5f78037102153cfd4e9cc0bb4b48 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: semantic_parsing_spider_zero_shot +include: ../generate_until_template_yaml +task: bigbench_semantic_parsing_spider_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml b/lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..263b453fac68a15afa2b8d4ac14328fe6e096124 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sentence_ambiguity_zero_shot +include: ../generate_until_template_yaml +task: bigbench_sentence_ambiguity_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml b/lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c33b1c8b1f0be9a26c8c5bc165195828a692d6d5 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: similarities_abstraction_zero_shot +include: ../generate_until_template_yaml +task: bigbench_similarities_abstraction_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6eb9cd87e782bdb6aa857d2550c515a2db9382fe --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simp_turing_concept_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simp_turing_concept_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ff5a1b1a8f51346978d03fd34cb6ad780f85f0b --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_arithmetic_json_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d1309732627fa2701012c7c53de12f42c0408cf --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_multiple_choice_zero_shot +include: ../generate_until_template_yaml +task: 
bigbench_simple_arithmetic_json_multiple_choice_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57052288e7fed1fabbe9a2c572b10c99f9a1fdcd --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_json_subtasks_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_arithmetic_json_subtasks_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml new file mode 100644 index 0000000000000000000000000000000000000000..393ec8843a009267ea2515fe21105b50fed672e2 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_arithmetic_multiple_targets_json_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_arithmetic_multiple_targets_json_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44960774fb04a69f7e2c24fa248567290923b6c3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_ethical_questions_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_ethical_questions_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml b/lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3310fa2126ea3c2601e4e4e16cdf22df06e8c4f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: simple_text_editing_zero_shot +include: ../generate_until_template_yaml +task: bigbench_simple_text_editing_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/snarks.yaml b/lm_eval/tasks/bigbench/generate_until/snarks.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d362537a181c1f6d3f72f139253f94d04b8154b6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/snarks.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: snarks_zero_shot +include: ../generate_until_template_yaml +task: bigbench_snarks_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/social_iqa.yaml b/lm_eval/tasks/bigbench/generate_until/social_iqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ba7721de1664e92a1f2de1359c44a5a1bf2e23c --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/social_iqa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: social_iqa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_social_iqa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/social_support.yaml b/lm_eval/tasks/bigbench/generate_until/social_support.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc00bb83755f75220a068b9c97047ec02e1eafed --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/social_support.yaml @@ -0,0 +1,4 @@ +# 
Generated by utils.py +dataset_name: social_support_zero_shot +include: ../generate_until_template_yaml +task: bigbench_social_support_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml b/lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..474c08aeb104a3ad171efe2975ab6a6d86c51e2a --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sports_understanding_zero_shot +include: ../generate_until_template_yaml +task: bigbench_sports_understanding_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/strange_stories.yaml b/lm_eval/tasks/bigbench/generate_until/strange_stories.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5405d92e2eea8199985004288270fc1c50bce96 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/strange_stories.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: strange_stories_zero_shot +include: ../generate_until_template_yaml +task: bigbench_strange_stories_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/strategyqa.yaml b/lm_eval/tasks/bigbench/generate_until/strategyqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47c4b25c971fbbf78c5d62ee79de7c0699af2ba9 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/strategyqa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: strategyqa_zero_shot +include: ../generate_until_template_yaml +task: bigbench_strategyqa_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml b/lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0705a250288610ebd7162a6a730dd1fef58973c3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sufficient_information_zero_shot +include: ../generate_until_template_yaml +task: bigbench_sufficient_information_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml b/lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e276c4a051d1507991e00499f344c72fe42a4147 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: suicide_risk_zero_shot +include: ../generate_until_template_yaml +task: bigbench_suicide_risk_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c218adb365d9d545fe9806c6d27e50390430ddea --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swahili_english_proverbs_zero_shot +include: ../generate_until_template_yaml +task: bigbench_swahili_english_proverbs_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a13d6f7fe014a2ab9a55fdb86cff68f8cb3401d --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml @@ -0,0 +1,4 @@ +# 
Generated by utils.py +dataset_name: swedish_to_german_proverbs_zero_shot +include: ../generate_until_template_yaml +task: bigbench_swedish_to_german_proverbs_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cca33bf67e954f18336d8becfb39d75c0e37df56 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: symbol_interpretation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_symbol_interpretation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml b/lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml new file mode 100644 index 0000000000000000000000000000000000000000..414dc51b137fb55037b5b9bc109bba116ee72d34 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: temporal_sequences_zero_shot +include: ../generate_until_template_yaml +task: bigbench_temporal_sequences_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/tense.yaml b/lm_eval/tasks/bigbench/generate_until/tense.yaml new file mode 100644 index 0000000000000000000000000000000000000000..480b95ec56dfe519b1446a0b7c3b3af00c930014 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/tense.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tense_zero_shot +include: ../generate_until_template_yaml +task: bigbench_tense_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/timedial.yaml b/lm_eval/tasks/bigbench/generate_until/timedial.yaml new file mode 100644 index 0000000000000000000000000000000000000000..854d8642b93197453e8e2d5242c8c1aeb30b519f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/timedial.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: timedial_zero_shot +include: ../generate_until_template_yaml +task: bigbench_timedial_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/topical_chat.yaml b/lm_eval/tasks/bigbench/generate_until/topical_chat.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47a301cf705d5abc403ddfa122b989bef2e82099 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/topical_chat.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: topical_chat_zero_shot +include: ../generate_until_template_yaml +task: bigbench_topical_chat_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c02866c8f07d5d8d9fdfd0459bbd01f327d19b3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tracking_shuffled_objects_zero_shot +include: ../generate_until_template_yaml +task: bigbench_tracking_shuffled_objects_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml b/lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9972f4034148bd4f8f4b59b122a89a416f3d5c2f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: understanding_fables_zero_shot 
+include: ../generate_until_template_yaml +task: bigbench_understanding_fables_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml b/lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f0e914c87cb31eea9b9524c4552eca2234eadce --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: undo_permutation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_undo_permutation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml b/lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f3747c46a0317851c8cc242458793504e0fd657 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unit_conversion_zero_shot +include: ../generate_until_template_yaml +task: bigbench_unit_conversion_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml b/lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34c882dc1dde88d9b57144260b4f90390f548ce6 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unit_interpretation_zero_shot +include: ../generate_until_template_yaml +task: bigbench_unit_interpretation_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..deddb77dbb72a092233b71562ebcfa277160e92e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: unnatural_in_context_learning_zero_shot +include: ../generate_until_template_yaml +task: bigbench_unnatural_in_context_learning_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f2ad8d3fd46a37ffc4fad10c1d927324054e043 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vitaminc_fact_verification_zero_shot +include: ../generate_until_template_yaml +task: bigbench_vitaminc_fact_verification_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a1487ab41c445cda992e30235947c6e8e9f01db --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: what_is_the_tao_zero_shot +include: ../generate_until_template_yaml +task: bigbench_what_is_the_tao_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc05c377785c652d603e275b6e9df7608eeef5fc --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml @@ -0,0 +1,4 @@ +# Generated 
by utils.py +dataset_name: which_wiki_edit_zero_shot +include: ../generate_until_template_yaml +task: bigbench_which_wiki_edit_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/winowhy.yaml b/lm_eval/tasks/bigbench/generate_until/winowhy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99ff22d9c7f80dc3d05cfed74ec8749e7b8790d3 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/winowhy.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: winowhy_zero_shot +include: ../generate_until_template_yaml +task: bigbench_winowhy_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/word_sorting.yaml b/lm_eval/tasks/bigbench/generate_until/word_sorting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16be6060b7700a43fb4f1084fd753e72d370b20e --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/word_sorting.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: word_sorting_zero_shot +include: ../generate_until_template_yaml +task: bigbench_word_sorting_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml b/lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5632a79c639f23b9635a810176a5ea10343c506f --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: word_unscrambling_zero_shot +include: ../generate_until_template_yaml +task: bigbench_word_unscrambling_generate_until diff --git a/lm_eval/tasks/bigbench/generate_until_template_yaml b/lm_eval/tasks/bigbench/generate_until_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e4db21f83a051b72428640787c1776761a71518 --- /dev/null +++ b/lm_eval/tasks/bigbench/generate_until_template_yaml @@ -0,0 +1,19 @@ +group: bigbench +dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed +output_type: generate_until +dataset_kwargs: + # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods + # subtask_name: null +test_split: default +doc_to_text: inputs +doc_to_target: "{{targets[0]}}" +generation_kwargs: + max_length: 128 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_punctuation: true + - metric: !function aux_metric.token_edit_distance # pip install textdistance + aggregation: mean + higher_is_better: false diff --git a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml deleted file mode 100644 index dd041fdd791de86580bf2971bba751fe820bbd9e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: abstract_narrative_understanding_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_abstract_narrative_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml b/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml deleted file mode 100644 index 9e7239279560426755127d4c7a129f8acede9893..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: anachronisms_zero_shot -include: ../greedy_until_template_yaml -task: 
bigbench_anachronisms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml b/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml deleted file mode 100644 index 3d2e82b4598735db74efb89a9ad6252ad1a938b4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: analogical_similarity_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_analogical_similarity_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml b/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml deleted file mode 100644 index a84250495cf14503af50c7fc967b877182aafa04..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: analytic_entailment_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_analytic_entailment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml deleted file mode 100644 index be296b1b4d15b433421c5d04d0614f49894ded84..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: arithmetic_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_arithmetic_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml deleted file mode 100644 index d199e8a5c9265d28cd4b09ae855f13917ab5cd28..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/ascii_word_recognition.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: ascii_word_recognition_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_ascii_word_recognition_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml deleted file mode 100644 index 65d8177c0eb144243a5acdbbe5feecd53332e936..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/authorship_verification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: authorship_verification_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_authorship_verification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml deleted file mode 100644 index 3ce364274dc112831c054e9e35e75a7282a19d50..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/auto_categorization.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: auto_categorization_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_auto_categorization_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml b/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml deleted file mode 100644 index e25bee2406d26a2cd8a798dcd84e3cc48266f73a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/auto_debugging.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: auto_debugging_zero_shot -include: ../greedy_until_template_yaml -task: 
bigbench_auto_debugging_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml b/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml deleted file mode 100644 index d1d45477c661d56b2e7e9bd883eb4a01a1c415d9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/bbq_lite_json.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: bbq_lite_json_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_bbq_lite_json_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml b/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml deleted file mode 100644 index a20da27f4f6826c4920177dbfefea059dce234ea..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/bridging_anaphora_resolution_barqa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: bridging_anaphora_resolution_barqa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_bridging_anaphora_resolution_barqa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml b/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml deleted file mode 100644 index 2b9c89afe7c23af37eff91b2904fb8e31404a61e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/causal_judgment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: causal_judgment_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_causal_judgment_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml b/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml deleted file mode 100644 index 5dd231087daadec60bd28f6ea16c5da5e608a8e3..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/cause_and_effect.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cause_and_effect_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cause_and_effect_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml b/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml deleted file mode 100644 index 066817692040e2a482597e4b7c51d7ae4bb9a246..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/checkmate_in_one.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: checkmate_in_one_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_checkmate_in_one_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml b/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml deleted file mode 100644 index 6a9a088e06548d0327ddb74267e4275b6d92c54a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/chess_state_tracking.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: chess_state_tracking_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_chess_state_tracking_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml b/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml deleted file mode 100644 index f393708800aa99aecacd1fff49909f480e887213..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/chinese_remainder_theorem.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: chinese_remainder_theorem_zero_shot -include: 
../greedy_until_template_yaml -task: bigbench_chinese_remainder_theorem_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml b/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml deleted file mode 100644 index 6bad67972913eb83dfb507848fb0ffb0635708bc..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/cifar10_classification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cifar10_classification_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cifar10_classification_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml b/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml deleted file mode 100644 index de1f78293c7388eff725818c9940c572aeaf333a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/code_line_description.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: code_line_description_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_code_line_description_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/codenames.yaml b/lm_eval/tasks/bigbench/greedy_until/codenames.yaml deleted file mode 100644 index 83feca88c4723e85cd84a795fb2173a7d8bb2861..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/codenames.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: codenames_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_codenames_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/color.yaml b/lm_eval/tasks/bigbench/greedy_until/color.yaml deleted file mode 100644 index 5aa9c1a9a1e6c64caf466725674ab780e11edca3..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/color.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: color_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_color_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml b/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml deleted file mode 100644 index ec0fdc44bd70d1a57158578506d7fe2268deccc2..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/common_morpheme.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: common_morpheme_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_common_morpheme_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml b/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml deleted file mode 100644 index 5eaba446dd726dd72d0d24815f60211f60bff053..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/conceptual_combinations.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: conceptual_combinations_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_conceptual_combinations_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml b/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml deleted file mode 100644 index afae81849cba036f82faa12dbd4dd5087b7fafc9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/conlang_translation.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: conlang_translation_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_conlang_translation_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml b/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml deleted file mode 100644 index bb7eba6434114c04e7a08e5e93911274a248d558..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/contextual_parametric_knowledge_conflicts.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: contextual_parametric_knowledge_conflicts_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_contextual_parametric_knowledge_conflicts_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml b/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml deleted file mode 100644 index ae7f6b9fe00dc43e58c6b3924e07e02066ccd1b8..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/crash_blossom.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: crash_blossom_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_crash_blossom_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml b/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml deleted file mode 100644 index 7d56bbc2bf92e9d015eea7009adff1c49584ae7c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/crass_ai.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: crass_ai_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_crass_ai_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml b/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml deleted file mode 100644 index 37fd99ad732c2132a7773d358c87e4df97911d52..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/cryobiology_spanish.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cryobiology_spanish_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cryobiology_spanish_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml b/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml deleted file mode 100644 index 645777381f0adf45f9c79fb14788ff39ad9c6956..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/cryptonite.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cryptonite_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cryptonite_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml b/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml deleted file mode 100644 index 9279c295c7fd2e0fdc36b9703fd76213b94d2a2b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/cs_algorithms.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: cs_algorithms_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_cs_algorithms_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml deleted file mode 100644 index 014d57e609c4a0f164a25733fc952176914bb6dd..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/dark_humor_detection.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: dark_humor_detection_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_dark_humor_detection_greedy_until diff --git 
a/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml deleted file mode 100644 index 999a7e71e44d2a52a7c4a181e9b20aa559467836..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/date_understanding.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: date_understanding_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_date_understanding_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml deleted file mode 100644 index db25589d72792c51d5391bdd89a14154407dbad2..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/disambiguation_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: disambiguation_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_disambiguation_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml b/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml deleted file mode 100644 index ae8941e8abb0643a6295f5b85e903e36c2310e91..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/discourse_marker_prediction.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: discourse_marker_prediction_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_discourse_marker_prediction_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml deleted file mode 100644 index 0086850a7ef31f0fb37a357325a1cf8adb456008..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/disfl_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: disfl_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_disfl_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml b/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml deleted file mode 100644 index e8de00930f3a40e47926d9e57e22d35cd1d986b2..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/dyck_languages.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: dyck_languages_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_dyck_languages_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml deleted file mode 100644 index 553691512aa69d6182ebf884ba7fa8618ee4694b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/elementary_math_qa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: elementary_math_qa_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_elementary_math_qa_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml b/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml deleted file mode 100644 index 4553ede7c7e3d2df9138fb248143377b6742e138..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/bigbench/greedy_until/emoji_movie.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# Generated by utils.py -dataset_name: emoji_movie_zero_shot -include: ../greedy_until_template_yaml -task: bigbench_emoji_movie_greedy_until diff --git a/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml 
b/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml
deleted file mode 100644
index e570e24a6e704729c73dd593b381e39502aa6f44..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/emojis_emotion_prediction.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: emojis_emotion_prediction_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_emojis_emotion_prediction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml b/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml
deleted file mode 100644
index d4f2f3cfcc82c50fda038c460827812a7056f2b3..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/empirical_judgments.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: empirical_judgments_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_empirical_judgments_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml
deleted file mode 100644
index b76287967fde33132a0a67a991e33a02026aa20a..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/english_proverbs.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: english_proverbs_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_english_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml
deleted file mode 100644
index ea719e1d1fa013009061483927fe1da1d2dfd640..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/english_russian_proverbs.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: english_russian_proverbs_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_english_russian_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml
deleted file mode 100644
index e3d89fc252e445511df97bd497fac2fb461efed8..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: entailed_polarity_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_entailed_polarity_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml b/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml
deleted file mode 100644
index e416a0596f88310c7baf3af23cfeef1b0652336e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/entailed_polarity_hindi.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: entailed_polarity_hindi_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_entailed_polarity_hindi_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml b/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml
deleted file mode 100644
index 8f8efc4e1ad961e6694941d797878fd8ff8b01a4..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/epistemic_reasoning.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: epistemic_reasoning_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_epistemic_reasoning_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml b/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml
deleted file mode 100644
index b35240c49b595eeeecae0696e650c6def383d694..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/evaluating_information_essentiality.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: evaluating_information_essentiality_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_evaluating_information_essentiality_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml b/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml
deleted file mode 100644
index f83e4081817540e3a1a612029ddbef71a5cc4aa0..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/fact_checker.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: fact_checker_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_fact_checker_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml b/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml
deleted file mode 100644
index ab38359d56e1d31d84733a9317da075f45cb5b97..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/fantasy_reasoning.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: fantasy_reasoning_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_fantasy_reasoning_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml b/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml
deleted file mode 100644
index bf1e33e0903a8a62ba05e33edd63fe726f2426cb..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/few_shot_nlg.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: few_shot_nlg_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_few_shot_nlg_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml
deleted file mode 100644
index 184cd4e6bd6c4188e86ca3cfa14d106c6ea5a063..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/figure_of_speech_detection.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: figure_of_speech_detection_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_figure_of_speech_detection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml b/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml
deleted file mode 100644
index cb1915b8741259e812fed40c52344ca614e46797..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/formal_fallacies_syllogisms_negation.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: formal_fallacies_syllogisms_negation_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_formal_fallacies_syllogisms_negation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/gem.yaml b/lm_eval/tasks/bigbench/greedy_until/gem.yaml
deleted file mode 100644
index aa43ca45f5ad63c97f9cb8117c6086809751957a..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/gem.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: gem_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_gem_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml b/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml
deleted file mode 100644
index 6471e57709c4178548d0d242f095b88cf9424557..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/gender_inclusive_sentences_german.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: gender_inclusive_sentences_german_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_gender_inclusive_sentences_german_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml
deleted file mode 100644
index 93a3f8753b55518164eeebd95c0c736f89159227..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/general_knowledge.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: general_knowledge_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_general_knowledge_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml b/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml
deleted file mode 100644
index c3a5d9a7651151317ea48aa4c3ac10cbba15fb19..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/geometric_shapes.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: geometric_shapes_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_geometric_shapes_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml b/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml
deleted file mode 100644
index 6fd557d33d6664caac111e667b2d0c2255e7642f..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/goal_step_wikihow.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: goal_step_wikihow_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_goal_step_wikihow_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml
deleted file mode 100644
index c4416b100a52df2daa9fa08ebfc958af59b3f8a6..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/gre_reading_comprehension.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: gre_reading_comprehension_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_gre_reading_comprehension_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml b/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml
deleted file mode 100644
index 4060824c4ed9d30a2d3d4f567ac37337c3dc7ffe..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/hhh_alignment.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: hhh_alignment_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_hhh_alignment_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml b/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml
deleted file mode 100644
index 5c4791b41a52f225465c6a0947e67eba820ccbd9..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/hindi_question_answering.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: hindi_question_answering_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_hindi_question_answering_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml b/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml
deleted file mode 100644
index 040441f783dcee989659f6a3f408b74a87ac12e9..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/hindu_knowledge.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: hindu_knowledge_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_hindu_knowledge_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml b/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml
deleted file mode 100644
index 0eb98e5193510946cda6df841a6cd71948462cb2..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/hinglish_toxicity.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: hinglish_toxicity_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_hinglish_toxicity_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml b/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml
deleted file mode 100644
index c5541571cb731b383d23c9fc2d7aa931a1cf6cbc..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/human_organs_senses.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: human_organs_senses_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_human_organs_senses_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml b/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml
deleted file mode 100644
index 4368f4c9a3c4d00fd9e1596fdb27b48733ad7f35..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/hyperbaton.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: hyperbaton_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_hyperbaton_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml
deleted file mode 100644
index 2c08703ea20557cc332a6dfa29c865cba1c632dd..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/identify_math_theorems.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: identify_math_theorems_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_identify_math_theorems_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml b/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml
deleted file mode 100644
index 9cb39d0d4e0fbcdad470fe184900b3053dd757a1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/identify_odd_metaphor.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: identify_odd_metaphor_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_identify_odd_metaphor_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml b/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml
deleted file mode 100644
index e216762c980a818787d4f9f38cb4dbaf1f17c186..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/implicatures.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: implicatures_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_implicatures_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml b/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml
deleted file mode 100644
index c7a82a1090de5128bfe0396076e64e208e9c96d1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/implicit_relations.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: implicit_relations_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_implicit_relations_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml b/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml
deleted file mode 100644
index 4839afa21364e883c9ef6c97c4d4cb1d4efbbd99..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/intent_recognition.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: intent_recognition_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_intent_recognition_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml
deleted file mode 100644
index 62643a461b0c700ef3a01160064ed4683de46135..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_nli.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: international_phonetic_alphabet_nli_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_international_phonetic_alphabet_nli_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml b/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml
deleted file mode 100644
index 05feb4f5b87a70e264a23779acee364d8c1ba0ac..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/international_phonetic_alphabet_transliterate.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: international_phonetic_alphabet_transliterate_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_international_phonetic_alphabet_transliterate_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml b/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml
deleted file mode 100644
index 57745d23e1d9a51ca99b84ae9213a1c2cc35a9ab..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/intersect_geometry.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: intersect_geometry_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_intersect_geometry_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml
deleted file mode 100644
index b49dfb4435f3e303a64bf4258f03f646e367d50c..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/irony_identification.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: irony_identification_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_irony_identification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml
deleted file mode 100644
index 293ff6c2d328bfe92115368a54494068e34f2bc3..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/kanji_ascii.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: kanji_ascii_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_kanji_ascii_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/kannada.yaml b/lm_eval/tasks/bigbench/greedy_until/kannada.yaml
deleted file mode 100644
index 00eeb32ab8604d57df88de9fdead1c5581e5660b..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/kannada.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: kannada_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_kannada_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml b/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml
deleted file mode 100644
index d313e1ce52d79037cc510980787512aee5d90a09..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/key_value_maps.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: key_value_maps_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_key_value_maps_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml b/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml
deleted file mode 100644
index d72e1d370a5c988b293a2afeac95031d5bc3ea9d..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/known_unknowns.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: known_unknowns_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_known_unknowns_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/language_games.yaml b/lm_eval/tasks/bigbench/greedy_until/language_games.yaml
deleted file mode 100644
index 61e85b535701cf2963ab1d249c0ae4ada0b075ac..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/language_games.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: language_games_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_language_games_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml b/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml
deleted file mode 100644
index 8db6563718c5456e0300de843c23f46bc7d3a6c1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/language_identification.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: language_identification_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_language_identification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml
deleted file mode 100644
index db6e9832a4516dcfca0fa59438832b37c6fb1411..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/linguistic_mappings.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: linguistic_mappings_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_linguistic_mappings_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml b/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml
deleted file mode 100644
index 4e3981f48602fa18c7108a539337ef78191bd519..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/linguistics_puzzles.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: linguistics_puzzles_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_linguistics_puzzles_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml b/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml
deleted file mode 100644
index 32afff69d9d8b04057fff6f05fd450def35c7797..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/list_functions.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: list_functions_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_list_functions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml b/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml
deleted file mode 100644
index a1d1b5b1c2d0cbc27538a6d41ae41df8450330e1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/logic_grid_puzzle.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: logic_grid_puzzle_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_logic_grid_puzzle_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml
deleted file mode 100644
index 201c04ae3f233f0eb7eb16c2004811ac0e8fb57e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/logical_args.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: logical_args_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_logical_args_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml
deleted file mode 100644
index 1b77561ddf8d65a65c10d751a221f162e12cf130..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/logical_deduction.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: logical_deduction_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_logical_deduction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml
deleted file mode 100644
index af3e9ea45f20cb57e292c3a3345c87c481992ab4..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/logical_fallacy_detection.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: logical_fallacy_detection_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_logical_fallacy_detection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml b/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml
deleted file mode 100644
index 4d4ffe1dcf75b57605ed879e7aa0099c822b13a5..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/logical_sequence.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: logical_sequence_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_logical_sequence_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml b/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml
deleted file mode 100644
index 84d0f4199bef82f5bc82689162d09f290ad5f422..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/mathematical_induction.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: mathematical_induction_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_mathematical_induction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml b/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml
deleted file mode 100644
index 956aa5f038bcd768488400ab7d37dfb7fe0f816e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/matrixshapes.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: matrixshapes_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_matrixshapes_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml
deleted file mode 100644
index 7fd4e53c003279fe0322911247dab96cde8ad0af..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/metaphor_boolean.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: metaphor_boolean_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_metaphor_boolean_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
deleted file mode 100644
index 12b79d44212b2998571802864a100be04cacea32..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/metaphor_understanding.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: metaphor_understanding_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_metaphor_understanding_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
deleted file mode 100644
index 459aec5742a179cc202248a7e2a746b294f41800..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/minute_mysteries_qa.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: minute_mysteries_qa_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_minute_mysteries_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
deleted file mode 100644
index 25038ae3fe91db622ab923e5323ab84c17ca2394..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/misconceptions.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: misconceptions_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_misconceptions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml b/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
deleted file mode 100644
index 676d94ea29bfbc5f76c375d8d8237194f8799333..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/misconceptions_russian.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: misconceptions_russian_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_misconceptions_russian_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml b/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
deleted file mode 100644
index 19c9a82bf80543c62c79690e748fe20b861c3e05..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/mnist_ascii.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: mnist_ascii_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_mnist_ascii_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml b/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
deleted file mode 100644
index 313b5b9deb5126ec98456cb2d957187047ecf24c..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/modified_arithmetic.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: modified_arithmetic_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_modified_arithmetic_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml b/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
deleted file mode 100644
index f478ed244e610eabd4fbf44ece3bf761930fe127..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/moral_permissibility.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: moral_permissibility_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_moral_permissibility_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
deleted file mode 100644
index 98e06e5d866b6a7ab62d8d7516befdffe774b521..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/movie_dialog_same_or_different.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: movie_dialog_same_or_different_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_movie_dialog_same_or_different_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml b/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
deleted file mode 100644
index 7cd021a469520fd6b3f4678cfb84d08ff4cbcacb..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/movie_recommendation.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: movie_recommendation_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_movie_recommendation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml b/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
deleted file mode 100644
index 92b84838f5be8d085cbeb491a5ee71ef10b1adad..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/mult_data_wrangling.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: mult_data_wrangling_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_mult_data_wrangling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml b/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
deleted file mode 100644
index ac4f94322945d41a3a731be4b6ea1bf0c08c6b4b..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/multiemo.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: multiemo_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_multiemo_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml b/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
deleted file mode 100644
index 0b87004de4468bca9df417f7c80a17c811d63f53..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/natural_instructions.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: natural_instructions_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_natural_instructions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/navigate.yaml b/lm_eval/tasks/bigbench/greedy_until/navigate.yaml
deleted file mode 100644
index 85fd618b185fb40d7509e7db4179c335cce36cbf..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/navigate.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: navigate_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_navigate_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml b/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
deleted file mode 100644
index 863b0a85ea775937bf30e3f94fcb52976c152083..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/nonsense_words_grammar.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: nonsense_words_grammar_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_nonsense_words_grammar_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml b/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
deleted file mode 100644
index b3b0880697976ba0ca471c5ed7b053ca37408d3b..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/novel_concepts.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: novel_concepts_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_novel_concepts_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml b/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
deleted file mode 100644
index fc0d611942ffd38577893529fa88bddd3840816c..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/object_counting.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: object_counting_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_object_counting_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml b/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
deleted file mode 100644
index 90d0fd9343cd2e7dd4e5c011eb9bd62699cc66af..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/odd_one_out.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: odd_one_out_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_odd_one_out_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/operators.yaml b/lm_eval/tasks/bigbench/greedy_until/operators.yaml
deleted file mode 100644
index d4ad9b91f2a9d56411bb19577aa03554e676e1c9..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/operators.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: operators_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_operators_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml b/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml
deleted file mode 100644
index c661e1a7cc131796af30b6230d43664595287946..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/paragraph_segmentation.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: paragraph_segmentation_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_paragraph_segmentation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml
deleted file mode 100644
index 4ea51e212ed3ed12bc21b41b86d10f0cc4b25d06..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/parsinlu_qa.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: parsinlu_qa_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_parsinlu_qa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml b/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml
deleted file mode 100644
index 967741fd14569dff53606344588533cd2d12e7b7..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/parsinlu_reading_comprehension.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: parsinlu_reading_comprehension_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_parsinlu_reading_comprehension_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml b/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml
deleted file mode 100644
index 5e59b741e4bc872d00bce8ee64192c838fe8bf94..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/penguins_in_a_table.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: penguins_in_a_table_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_penguins_in_a_table_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml b/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml
deleted file mode 100644
index a7ed5a8246b89699fc9ace627af9dc7b34c021b3..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/periodic_elements.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: periodic_elements_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_periodic_elements_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml b/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml
deleted file mode 100644
index 087d46880e56e25210ee40a0f4a6cd96d4e9ea20..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/persian_idioms.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: persian_idioms_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_persian_idioms_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml b/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml
deleted file mode 100644
index c2da5cce26b2c2463e59c078b530a9c93c78ef49..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/phrase_relatedness.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: phrase_relatedness_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_phrase_relatedness_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml b/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml
deleted file mode 100644
index 1482fe65ebb087b953e3c7b7fbfc9edbdb127ca9..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/physical_intuition.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: physical_intuition_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_physical_intuition_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/physics.yaml b/lm_eval/tasks/bigbench/greedy_until/physics.yaml
deleted file mode 100644
index 7fade7b3a0097794f61c7ba083ce5737477338df..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/physics.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: physics_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_physics_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml
deleted file mode 100644
index bf332361d9d7aa909ad8e284d6aef3278483af57..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/physics_questions.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: physics_questions_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_physics_questions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml b/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml
deleted file mode 100644
index 1ddf7ca7b0b6169520a2d6b0f5ee1e7fb7a223b8..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/play_dialog_same_or_different.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: play_dialog_same_or_different_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_play_dialog_same_or_different_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml b/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml
deleted file mode 100644
index 10c8bd98c1341f32542b01e88f4c607e2df20b0e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/polish_sequence_labeling.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: polish_sequence_labeling_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_polish_sequence_labeling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml b/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml
deleted file mode 100644
index 66d0e5eafef08e54202ac49bc8a6371de27e57f1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/presuppositions_as_nli.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: presuppositions_as_nli_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_presuppositions_as_nli_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml b/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml
deleted file mode 100644
index 672401103297c3adc0a5c7837dbfb8e3b416281d..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/qa_wikidata.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: qa_wikidata_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_qa_wikidata_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml b/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml
deleted file mode 100644
index 5652cb3f45dab50d63d3c922b0e3a402aa2666ca..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/question_selection.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: question_selection_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_question_selection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml b/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml
deleted file mode 100644
index c206597bab2675985e78a05bea44de12a2cbe388..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/real_or_fake_text.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: real_or_fake_text_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_real_or_fake_text_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml
deleted file mode 100644
index 8b1051e59225afb634b24be95c22dbcee8bfb7c4..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/reasoning_about_colored_objects.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: reasoning_about_colored_objects_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_reasoning_about_colored_objects_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml b/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml
deleted file mode 100644
index 279ecd0115169b840a2363138a786fe841cd1134..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/repeat_copy_logic.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: repeat_copy_logic_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_repeat_copy_logic_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml b/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml
deleted file mode 100644
index 90135638133afbbd319cc31890f04944331bd268..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/rephrase.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: rephrase_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_rephrase_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml b/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml
deleted file mode 100644
index a11c167d34b1e5f5ae9e46288e29028f566a6aec..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/riddle_sense.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: riddle_sense_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_riddle_sense_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml b/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml
deleted file mode 100644
index 5074e010ea5a219e6f43b06953652f9b937a2d4f..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/ruin_names.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: ruin_names_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_ruin_names_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml b/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml
deleted file mode 100644
index 7f2ce43365720d7fb8d4b539f6526e55d3439802..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/salient_translation_error_detection.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: salient_translation_error_detection_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_salient_translation_error_detection_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml b/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml
deleted file mode 100644
index 900718827b1d6245ef66c5ceec7026695f0df147..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/scientific_press_release.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: scientific_press_release_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_scientific_press_release_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml
deleted file mode 100644
index 93ddccc2ca09a209e6e25c8d46f7c20d2dee5125..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_in_context_sparc.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: semantic_parsing_in_context_sparc_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_semantic_parsing_in_context_sparc_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml b/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml
deleted file mode 100644
index cc590faf370e2672bb33b354a4eea22716dd0c07..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/semantic_parsing_spider.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: semantic_parsing_spider_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_semantic_parsing_spider_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml b/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml
deleted file mode 100644
index 6cbacb793ebc22d12423b2adad02c5fbedf4239a..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/sentence_ambiguity.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: sentence_ambiguity_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_sentence_ambiguity_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml b/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml
deleted file mode 100644
index 10e9a4398bf309f2809ce66cb36431f48080bf6c..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/similarities_abstraction.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: similarities_abstraction_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_similarities_abstraction_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml b/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml
deleted file mode 100644
index a82b8226a6efea23d0f5d315fddc6ccb3a5cd003..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/simp_turing_concept.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: simp_turing_concept_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_simp_turing_concept_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml
deleted file mode 100644
index 8e0a207e30631b670e077df60d7aaf03debe9f6d..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: simple_arithmetic_json_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_simple_arithmetic_json_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml
deleted file mode 100644
index df2353252c930bb50f625e3b23e6da0585a537ce..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_multiple_choice.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: simple_arithmetic_json_multiple_choice_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_simple_arithmetic_json_multiple_choice_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml
deleted file mode 100644
index 2f981fb0658bbae9d309c31f2b9fa96e22b6561e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_json_subtasks.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: simple_arithmetic_json_subtasks_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_simple_arithmetic_json_subtasks_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml
deleted file mode 100644
index 2bc6cf1615c89b24b3cafe0deb4838b7b7e9babe..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/simple_arithmetic_multiple_targets_json.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: simple_arithmetic_multiple_targets_json_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_simple_arithmetic_multiple_targets_json_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml
deleted file mode 100644
index 77e45a58511514a4a063db9cd085efddfd35289e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/simple_ethical_questions.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: simple_ethical_questions_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_simple_ethical_questions_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml b/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml
deleted file mode 100644
index 1b485d5ca3fd2aeef1a25dc95dc68915976a646b..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/simple_text_editing.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: simple_text_editing_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_simple_text_editing_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/snarks.yaml b/lm_eval/tasks/bigbench/greedy_until/snarks.yaml
deleted file mode 100644
index 9ccbda748a92ed4dece4474b30fc9d98250fe429..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/snarks.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: snarks_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_snarks_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml b/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml
deleted file mode 100644
index 9cbc5ec511a1d423702fa0e22a4be76cb849ff3e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/social_iqa.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: social_iqa_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_social_iqa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/social_support.yaml b/lm_eval/tasks/bigbench/greedy_until/social_support.yaml
deleted file mode 100644
index bcc3a9d1ddc11b0652266e5c34269529a09b1e02..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/social_support.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: social_support_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_social_support_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml b/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml
deleted file mode 100644
index 01082a10d67388931fc5fc91a272ffb575ac03ea..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/sports_understanding.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: sports_understanding_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_sports_understanding_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml b/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml
deleted file mode 100644
index a0bf1c463e4ff85954053df8bc84751fa83180fb..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/strange_stories.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: strange_stories_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_strange_stories_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml b/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml
deleted file mode 100644
index 495d873f3ed4b36dc1fc31d1b858d0ab7757f0d1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/strategyqa.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: strategyqa_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_strategyqa_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml b/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml
deleted file mode 100644
index 3484952c1a0e3ae5b2e366e91841b37cc9fb2ee1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/sufficient_information.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: sufficient_information_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_sufficient_information_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml b/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml
deleted file mode 100644
index a8e980d537fcbca717ade815bfed9ac0da36bf3e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/suicide_risk.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: suicide_risk_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_suicide_risk_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml
deleted file mode 100644
index ff045534a18f2b27ac0dbba7ad877ad4cf301b56..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/swahili_english_proverbs.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: swahili_english_proverbs_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_swahili_english_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml b/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml
deleted file mode 100644
index 8cbd401ba3ac6d9a37604d5eced0d02b0da70d91..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/swedish_to_german_proverbs.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: swedish_to_german_proverbs_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_swedish_to_german_proverbs_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml
deleted file mode 100644
index 3fa4cdbab6969ba2ea510afa5f73f59f0ea2a44b..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/symbol_interpretation.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: symbol_interpretation_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_symbol_interpretation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml b/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml
deleted file mode 100644
index c20300f81cf7e708164357d4ad1e354ba7b64414..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/temporal_sequences.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: temporal_sequences_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_temporal_sequences_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/tense.yaml b/lm_eval/tasks/bigbench/greedy_until/tense.yaml
deleted file mode 100644
index b1b5698d3119cb3bbeb580876e59f62836853887..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/tense.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: tense_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_tense_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/timedial.yaml b/lm_eval/tasks/bigbench/greedy_until/timedial.yaml
deleted file mode 100644
index d5f1950e66eea3555e995f7f25893d47eef35757..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/timedial.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: timedial_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_timedial_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml b/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml
deleted file mode 100644
index 4ec83039ec7b9aa9ab95663b50e3b25b9457683e..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/topical_chat.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: topical_chat_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_topical_chat_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml b/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml
deleted file mode 100644
index 27024bee06f9c84577c240d6a59a34408a4d9f04..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/tracking_shuffled_objects.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: tracking_shuffled_objects_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_tracking_shuffled_objects_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml b/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml
deleted file mode 100644
index f467652d6328d22985247c3096c2a2dda7836f08..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/understanding_fables.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: understanding_fables_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_understanding_fables_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml b/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml
deleted file mode 100644
index d91ff3313a31f2a2ca38d46d40459dc4c8a28fd3..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/undo_permutation.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: undo_permutation_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_undo_permutation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml
deleted file mode 100644
index a31929fbdb1c8386d99faabe0c0ea18e9eb3abad..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/unit_conversion.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: unit_conversion_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_unit_conversion_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml b/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml
deleted file mode 100644
index ca4c38be09e356501a0328a71651b8d3129b4fd1..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/unit_interpretation.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: unit_interpretation_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_unit_interpretation_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml b/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml
deleted file mode 100644
index 1cc271d25a4c108dfd04d5466822a97b62a09183..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/unnatural_in_context_learning.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: unnatural_in_context_learning_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_unnatural_in_context_learning_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml b/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml
deleted file mode 100644
index 770e850074ee438a6bbbfd8f85c71b17a053bede..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/vitaminc_fact_verification.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: vitaminc_fact_verification_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_vitaminc_fact_verification_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml b/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml
deleted file mode 100644
index 8c60da65bf5bd9c0f2fcca37dc07b4fb45f7bcc9..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/what_is_the_tao.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: what_is_the_tao_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_what_is_the_tao_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml b/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml
deleted file mode 100644
index 4eda6d089f1f0fbc897fb640bd2eb13ddb5a5b64..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/which_wiki_edit.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: which_wiki_edit_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_which_wiki_edit_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml b/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml
deleted file mode 100644
index e065c80cf24572dcb5a3cb25d7ccd84071e5d9f9..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/winowhy.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: winowhy_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_winowhy_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml b/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
deleted file mode 100644
index caa6f02d512c316921eb7109b51f217761a433ca..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/word_sorting.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: word_sorting_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_word_sorting_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml b/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
deleted file mode 100644
index 774aef15eed52a5c222c86a12c4f5ba2ed5d1025..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until/word_unscrambling.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-# Generated by utils.py
-dataset_name: word_unscrambling_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_word_unscrambling_greedy_until
diff --git a/lm_eval/tasks/bigbench/greedy_until_template_yaml b/lm_eval/tasks/bigbench/greedy_until_template_yaml
deleted file mode 100644
index 130500cc6bda51786d2a72fa4424bbded716f114..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/bigbench/greedy_until_template_yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-group: bigbench
-dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed
-output_type: greedy_until
-dataset_kwargs:
-  # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
-  # subtask_name: null
-test_split: default
-doc_to_text: inputs
-doc_to_target: "{{targets[0]}}"
-generation_kwargs:
-  max_length: 128
-metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
-    ignore_punctuation: true
diff --git a/lm_eval/tasks/code_x_glue/code-text/go.yaml b/lm_eval/tasks/code_x_glue/code-text/go.yaml
index 3a4033c6531329f3d16f1795023d6efc9075650a..f86706523509c828414d933eee8bf366fcd487e6 100644
--- a/lm_eval/tasks/code_x_glue/code-text/go.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_go
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/java.yaml b/lm_eval/tasks/code_x_glue/code-text/java.yaml
index 141673c999930dd51c140d3741d549b7437a99f6..aaad034c95edcebf494fd3196eed047918476131 100644
--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
diff --git a/lm_eval/tasks/code_x_glue/code-text/java.yaml b/lm_eval/tasks/code_x_glue/code-text/java.yaml
index 141673c999930dd51c140d3741d549b7437a99f6..aaad034c95edcebf494fd3196eed047918476131 100644
--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_java
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
index c537e50d241b1cc29847ca0cc03948ec3c5986d1..615fffe32053f7fd720a88d3c6b9a414c49380da 100644
--- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_javascript
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/php.yaml b/lm_eval/tasks/code_x_glue/code-text/php.yaml
index 9137bdaf79e76bce19123363f9ae1a0cde524391..b71a7525f093e5fe74bb5bbddd14d254394bc69a 100644
--- a/lm_eval/tasks/code_x_glue/code-text/php.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_php
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/python.yaml b/lm_eval/tasks/code_x_glue/code-text/python.yaml
index a98bfdba4fe260a4d414b77c11180d68d592f5f6..301251b0467dcff99c26b903870d0dc2b0c891a4 100644
--- a/lm_eval/tasks/code_x_glue/code-text/python.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_python
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
index d6562d4c4f84ea9976ce91877e8817e95e9da845..6a3b4a5a3f24f4b84a483609260d4e3c43c441e6 100644
--- a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
@@ -5,7 +5,7 @@ dataset_path: CM/codexglue_code2text_ruby
 training_split: train
 validation_split: validation
 test_split: test
-output_type: greedy_until
+output_type: generate_until
 generation_kwargs:
   num_beams: 10
   max_length: 128
diff --git a/lm_eval/tasks/coqa/default.yaml b/lm_eval/tasks/coqa/default.yaml
index 5b89119251ed1b58d213003720dd0cb9eee28e95..f928e1f7312241db6722d575243c7197b32e1d27 100644
--- a/lm_eval/tasks/coqa/default.yaml
+++ b/lm_eval/tasks/coqa/default.yaml
@@ -1,6 +1,6 @@
 task: coqa
 dataset_path: EleutherAI/coqa
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text
diff --git a/lm_eval/tasks/drop/default.yaml b/lm_eval/tasks/drop/default.yaml
index 973fff7b88d06ce159dfb4411aaf4cdc51c4dd56..28560312d563659c9a03daa01528deef4fbff199 100644
--- a/lm_eval/tasks/drop/default.yaml
+++ b/lm_eval/tasks/drop/default.yaml
@@ -1,6 +1,6 @@
 task: drop
 dataset_path: EleutherAI/drop
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 process_docs: !function utils.process_docs
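The CoQA and DROP configs above delegate prompt construction to Python via `!function utils.doc_to_text`. A sketch of the contract such a reference must satisfy (illustrative only; the field names assume a CoQA-style schema, and the real implementation lives in the `utils.py` next to each config):

```python
# Illustrative sketch, not the repo's actual implementation: a `!function`
# reference resolves to a callable mapping one dataset doc to a prompt string.
def doc_to_text(doc: dict) -> str:
    # Assumed CoQA-style fields: a passage plus prior question/answer turns.
    turns = zip(doc["questions"], doc["answers"]["input_text"])
    history = "".join(f"\n\nQ: {q}\n\nA: {a}" for q, a in turns)
    return doc["story"] + history + "\n\nQ:"
```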
diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
index 6236e51978c3e58d1cddd8bb1e0d5dab08d41c8a..7138140088f74d84df07bbfd476913fd96111dde 100644
--- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
@@ -3,7 +3,7 @@ group:
 task: gsm8k_cot
 dataset_path: gsm8k
 dataset_name: main
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
 Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\
@@ -14,8 +14,7 @@ Q: There were nine computers in the server room. Five more computers were instal
 Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\n\
 Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.\n\n\
 Q: {{question}}\n\nA:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against
+doc_to_target: " {{answer.split('### ')[-1].rstrip()}}"
 metric_list:
   - metric: exact_match
     aggregation: mean
@@ -25,6 +24,8 @@ metric_list:
     regexes_to_ignore:
       - ","
       - "\\$"
+      - "(?s).*#### "
+      - "\n\n"
 generation_kwargs:
   until:
     - "Q:"
@@ -37,5 +38,5 @@ filter_list:
   - name: "get-answer"
     filter:
       - function: "regex"
-        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
+        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
       - function: "take_first"
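Note the one-character change to `regex_pattern` above: the greedy class `[0-9\.\,]+` otherwise swallows the sentence-final period into the captured answer, which then fails `exact_match` against a bare number. A quick standalone check of the two patterns, mirroring what the regex filter plus `take_first` do to a model completion:

```python
import re

completion = "5 bagels cost 5 x 3 = 15 dollars. 23 - 15 is 8. The answer is 8."
old = re.findall(r"The answer is (\-?[0-9\.\,]+)", completion)
new = re.findall(r"The answer is (\-?[0-9\.\,]+).", completion)
print(old)  # ['8.'] -- trailing period captured by the greedy character class
print(new)  # ['8']  -- the "." after the group consumes the period instead
```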
- function: "take_first" diff --git a/lm_eval/tasks/gsm8k/gsm8k.yaml b/lm_eval/tasks/gsm8k/gsm8k.yaml index ebd1b4e1a3bf9b724fb24797e0d543ef4feed841..9cf161580cacdc24c057973df3fb13a7e50399ca 100644 --- a/lm_eval/tasks/gsm8k/gsm8k.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k.yaml @@ -1,15 +1,14 @@ group: - math_word_problems -task: gsm8k_yaml +task: gsm8k dataset_path: gsm8k dataset_name: main -output_type: greedy_until +output_type: generate_until training_split: train fewshot_split: train test_split: test doc_to_text: "Question: {{question}}\nAnswer:" doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" -gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against metric_list: - metric: exact_match aggregation: mean @@ -19,7 +18,7 @@ metric_list: regexes_to_ignore: - "," - "\\$" - - ".*### " + - "(?s).*#### " generation_kwargs: until: - "\n\n" @@ -28,9 +27,9 @@ generation_kwargs: temperature: 0.0 repeats: 1 num_fewshot: 5 -# filter_list: -# - name: "get-answer" -# filter: -# - function: "regex" -# regex_pattern: "### (\\-?[0-9\\.\\,]+)" -# - function: "take_first" +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" diff --git a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml index a7e712cce18f8f778c0cd9ff8337e25e11a50f3c..04b433f6c90cf3ed722db37cfb6c490c6c06701b 100644 --- a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml +++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml @@ -9,7 +9,6 @@ # template_aliases: #"{% set answer_choices = range(1, 11)|list %}" # doc_to_text: 'Activity: "{{activity}}"\nRating:' # doc_to_target: "{{answer_choices[label]}}" -# gold_alias: "{{label}}" # this will be cast to an int. # metric_list: # - metric: acc # TODO: we want this to be implemented as a winograd_schema task type, actually diff --git a/lm_eval/tasks/logiqa2/logieval.yaml b/lm_eval/tasks/logiqa2/logieval.yaml index 7701426ee13f0e2bf5dd9941f66a9ab471a4d3ce..9f945be44c464aa2a008e38cb6f7950d22ab4f56 100644 --- a/lm_eval/tasks/logiqa2/logieval.yaml +++ b/lm_eval/tasks/logiqa2/logieval.yaml @@ -1,7 +1,7 @@ task: logieval dataset_path: baber/logiqa2 dataset_name: logieval -output_type: greedy_until +output_type: generate_until training_split: train test_split: test # Instructions + {content} diff --git a/lm_eval/tasks/mgsm/direct/direct_yaml b/lm_eval/tasks/mgsm/direct/direct_yaml index 6eae3257510142b11ed6717a5ace71f85b958917..0833ff8aae29768831af57db25bc589e494359ef 100644 --- a/lm_eval/tasks/mgsm/direct/direct_yaml +++ b/lm_eval/tasks/mgsm/direct/direct_yaml @@ -4,7 +4,7 @@ group: mgsm_direct dataset_path: juletxara/mgsm dataset_name: null # Overridden by language-specific config. -output_type: greedy_until +output_type: generate_until training_split: train test_split: test target_delimiter: "" diff --git a/lm_eval/tasks/mgsm/en_cot/cot_yaml b/lm_eval/tasks/mgsm/en_cot/cot_yaml index f5cf60d99659923fddcf2248f3f48c5a6cfd6bed..06308fd7f7aa539292000542349599929cb1f480 100644 --- a/lm_eval/tasks/mgsm/en_cot/cot_yaml +++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml @@ -4,7 +4,7 @@ group: mgsm_cot_native dataset_path: juletxara/mgsm dataset_name: null # Overridden by language-specific config. 
-output_type: greedy_until +output_type: generate_until training_split: train test_split: test target_delimiter: "" diff --git a/lm_eval/tasks/mgsm/native_cot/cot_yaml b/lm_eval/tasks/mgsm/native_cot/cot_yaml index f5cf60d99659923fddcf2248f3f48c5a6cfd6bed..06308fd7f7aa539292000542349599929cb1f480 100644 --- a/lm_eval/tasks/mgsm/native_cot/cot_yaml +++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml @@ -4,7 +4,7 @@ group: mgsm_cot_native dataset_path: juletxara/mgsm dataset_name: null # Overridden by language-specific config. -output_type: greedy_until +output_type: generate_until training_split: train test_split: test target_delimiter: "" diff --git a/lm_eval/tasks/minerva_math/README.md b/lm_eval/tasks/minerva_math/README.md index 7ca5d652a0f05c0f98589c39e213db338c1bcfb1..7bfb7d5015a5b465a47b279a4dfb29ae170a5bfc 100644 --- a/lm_eval/tasks/minerva_math/README.md +++ b/lm_eval/tasks/minerva_math/README.md @@ -37,7 +37,7 @@ Eprint = {arXiv:2206.14858}, #### Groups - `math_word_problems` -- `greedy_until` +- `generate_until` #### Tasks diff --git a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml index 3ad3802bf500f1eaa61fe9e112726de3bf13bed1..8aca7ad5341917beb9007b679e0e517a5807a080 100644 --- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml +++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml @@ -4,7 +4,7 @@ task: minerva_math_algebra dataset_path: EleutherAI/hendrycks_math process_docs: !function utils.process_docs dataset_name: algebra -output_type: greedy_until +output_type: generate_until training_split: train test_split: test doc_to_text: !function utils.doc_to_text diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index 542e11b2d5a2c75495c5a5a78340665db6db1e6a..2bf27ac0f74f03f8ca98904e1c7775ec0136e8f9 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -1,5 +1,5 @@ """ -Take in a YAML, and output all other splits with this YAML +Take in a YAML, and output all "other" splits with this YAML """ import os import yaml @@ -10,73 +10,74 @@ from tqdm import tqdm from lm_eval import utils from lm_eval.logger import eval_logger -SUBJECTS = [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - 
"world_religions", -] +SUBJECTS = { + "abstract_algebra": "stem", + "anatomy": "stem", + "astronomy": "stem", + "business_ethics": "other", + "clinical_knowledge": "other", + "college_biology": "stem", + "college_chemistry": "stem", + "college_computer_science": "stem", + "college_mathematics": "stem", + "college_medicine": "other", + "college_physics": "stem", + "computer_security": "stem", + "conceptual_physics": "stem", + "econometrics": "social_sciences", + "electrical_engineering": "stem", + "elementary_mathematics": "stem", + "formal_logic": "humanities", + "global_facts": "other", + "high_school_biology": "stem", + "high_school_chemistry": "stem", + "high_school_computer_science": "stem", + "high_school_european_history": "humanities", + "high_school_geography": "social_sciences", + "high_school_government_and_politics": "social_sciences", + "high_school_macroeconomics": "social_sciences", + "high_school_mathematics": "stem", + "high_school_microeconomics": "social_sciences", + "high_school_physics": "stem", + "high_school_psychology": "social_sciences", + "high_school_statistics": "stem", + "high_school_us_history": "humanities", + "high_school_world_history": "humanities", + "human_aging": "other", + "human_sexuality": "social_sciences", + "international_law": "humanities", + "jurisprudence": "humanities", + "logical_fallacies": "humanities", + "machine_learning": "stem", + "management": "other", + "marketing": "other", + "medical_genetics": "other", + "miscellaneous": "other", + "moral_disputes": "humanities", + "moral_scenarios": "humanities", + "nutrition": "other", + "philosophy": "humanities", + "prehistory": "humanities", + "professional_accounting": "other", + "professional_law": "humanities", + "professional_medicine": "other", + "professional_psychology": "social_sciences", + "public_relations": "social_sciences", + "security_studies": "social_sciences", + "sociology": "social_sciences", + "us_foreign_policy": "social_sciences", + "virology": "other", + "world_religions": "humanities", +} def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--base_yaml_path", required=True) - parser.add_argument("--save_prefix_path", default="flan") + parser.add_argument("--save_prefix_path", default="mmlu") parser.add_argument("--cot_prompt_path", default=None) parser.add_argument("--task_prefix", default="") + parser.add_argument("--group_prefix", default="") return parser.parse_args() @@ -84,7 +85,7 @@ if __name__ == "__main__": args = parse_args() - # get filename of base_yaml so we can `"include": ` it in our other YAMLs. + # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs. 
base_yaml_name = os.path.split(args.base_yaml_path)[-1] with open(args.base_yaml_path) as f: base_yaml = yaml.full_load(f) @@ -95,7 +96,12 @@ if __name__ == "__main__": with open(args.cot_prompt_path) as f: cot_file = json.load(f) - for subject in tqdm(SUBJECTS): + ALL_CATEGORIES = [] + for subject, category in tqdm(SUBJECTS.items()): + + if category not in ALL_CATEGORIES: + ALL_CATEGORIES.append(category) + if args.cot_prompt_path is not None: description = cot_file[subject] else: @@ -103,9 +109,14 @@ if __name__ == "__main__": yaml_dict = { "include": base_yaml_name, + "group": f"mmlu_{args.task_prefix}_{category}" + if args.task_prefix != "" + else f"mmlu_{category}", + "group_alias": category.replace("_", " "), "task": f"mmlu_{args.task_prefix}_{subject}" if args.task_prefix != "" else f"mmlu_{subject}", + "task_alias": subject.replace("_", " "), "dataset_name": subject, "description": description, } @@ -116,7 +127,33 @@ if __name__ == "__main__": yaml.dump( yaml_dict, yaml_file, - width=float("inf"), + # width=float("inf"), allow_unicode=True, default_style='"', ) + + if args.task_prefix != "": + mmlu_subcategories = [ + f"mmlu_{args.task_prefix}_{category}" for category in ALL_CATEGORIES + ] + else: + mmlu_subcategories = [f"mmlu_{category}" for category in ALL_CATEGORIES] + + if args.group_prefix != "": + file_save_path = args.group_prefix + ".yaml" + else: + file_save_path = args.save_prefix_path + ".yaml" + + eval_logger.info(f"Saving benchmark config to {file_save_path}") + with open(file_save_path, "w") as yaml_file: + yaml.dump( + { + "group": f"mmlu_{args.task_prefix}" + if args.task_prefix != "" + else "mmlu", + "task": mmlu_subcategories, + }, + yaml_file, + indent=4, + default_flow_style=False, + ) diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index bd989c408aeca666cc31babfd39f5a5c7b916f56..ebac529e29fa1c783a9efabc71f638c1d3965fd2 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -1,5 +1,5 @@ group: mmlu -dataset_path: cais/mmlu +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split test_split: test fewshot_split: dev fewshot_config: @@ -12,6 +12,6 @@ metric_list: - metric: acc aggregation: mean higher_is_better: true - - metric: acc_norm + - metric: brier_score aggregation: mean - higher_is_better: true + higher_is_better: false diff --git a/lm_eval/tasks/mmlu/default/_mmlu.yaml b/lm_eval/tasks/mmlu/default/_mmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..584de02993acedc37b6d508967334f9a55675f6b --- /dev/null +++ b/lm_eval/tasks/mmlu/default/_mmlu.yaml @@ -0,0 +1,6 @@ +group: mmlu +task: + - mmlu_stem + - mmlu_other + - mmlu_social_sciences + - mmlu_humanities diff --git a/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml index b6d595d33bda3036c0af827a22d974f3700eaac8..90f3cc50b3f8e54b584237edfa50fcdc96d0e625 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml @@ -1,4 +1,8 @@ "dataset_name": "abstract_algebra" -"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" +"description": "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_abstract_algebra" 
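Taken together, the script changes above mean each subject YAML is emitted with a category-level `group`/`group_alias` plus a human-readable `task_alias`, and a final umbrella config ties the four category groups into one `mmlu` group (the `_mmlu.yaml` file just above). A condensed restatement of that output shape, using an illustrative two-subject excerpt of `SUBJECTS`:

```python
# Condensed restatement of what _generate_configs.py now emits; the excerpt
# dict is illustrative, while the field layout mirrors the diff above.
SUBJECTS = {"abstract_algebra": "stem", "econometrics": "social_sciences"}

subject_configs = [
    {
        "include": "_default_template_yaml",
        "group": f"mmlu_{category}",
        "group_alias": category.replace("_", " "),
        "task": f"mmlu_{subject}",
        "task_alias": subject.replace("_", " "),
        "dataset_name": subject,
    }
    for subject, category in SUBJECTS.items()
]
umbrella = {"group": "mmlu", "task": sorted({f"mmlu_{c}" for c in SUBJECTS.values()})}
print(umbrella)  # {'group': 'mmlu', 'task': ['mmlu_social_sciences', 'mmlu_stem']}
```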
+"task_alias": "abstract_algebra" diff --git a/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml index 6459cb41c2b011b72a9f34267f66c966f3be40a6..0e9e09b27fa40e222d6284c58826a089391e1bb4 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml @@ -1,4 +1,8 @@ "dataset_name": "anatomy" -"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_anatomy" +"task_alias": "anatomy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml index 573dedd751bc6e771721c47b24bef6323fcc317c..e3bdfc9521f3b98bf684a1c022f69e3c14c9d406 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml @@ -1,4 +1,8 @@ "dataset_name": "astronomy" -"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\ + \n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_astronomy" +"task_alias": "astronomy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml index 4b20b795a1c4d66dd364d8183520373911c4518e..ea0d1fe2a68cb893759f3c550775ff3554f4fc13 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml @@ -1,4 +1,8 @@ "dataset_name": "business_ethics" -"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_business_ethics" +"task_alias": "business_ethics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml index f758e66db42d7d33105248c04b18904ad7cd0332..20bab1470e8b58b9e56652edf8128a4e7959bd33 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml @@ -1,4 +1,8 @@ "dataset_name": "clinical_knowledge" -"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_clinical_knowledge" +"task_alias": "clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml index f806900785c32d37ee309f7743d815ec46b29ed6..afb4d9c6c0d209644806ca8376a25cf28d6fa2ed 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml @@ -1,4 +1,8 @@ "dataset_name": "college_biology" -"description": "The following are multiple choice questions (with answers) about college biology.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" 
"include": "_default_template_yaml" "task": "mmlu_college_biology" +"task_alias": "college_biology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml index e03fbccd4194dbe38c82d21ee866513095aba81a..a7de3532359a418dc0d1ba8f9096700e2a888907 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml @@ -1,4 +1,8 @@ "dataset_name": "college_chemistry" -"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_chemistry" +"task_alias": "college_chemistry" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml index a9d4a6f2887b933e3be492783551a4c5d52d1854..9786cc6ebd0d14936bb0138d322161b01a4574dd 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml @@ -1,4 +1,8 @@ "dataset_name": "college_computer_science" -"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_computer_science" +"task_alias": "college_computer_science" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml index f6a86179c7eeb112725cb95a841d6521bf789ab4..e7699f8bfcedffdcc83984f96c6d337f41135b54 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml @@ -1,4 +1,8 @@ "dataset_name": "college_mathematics" -"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_mathematics" +"task_alias": "college_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml index 0ea75fb31fc230ce3faa2002c0705a4ee444274c..df9e890136328d4caf3ca3c4715919358dcf92dc 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml @@ -1,4 +1,8 @@ "dataset_name": "college_medicine" -"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_college_medicine" +"task_alias": "college_medicine" diff --git a/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml index 82f13e403059ac8ce777df4f037c2696adca1100..3c5e7462e3fefe122e9df4bb5ff9458f8ae9dfc8 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml 
@@ -1,4 +1,8 @@ "dataset_name": "college_physics" -"description": "The following are multiple choice questions (with answers) about college physics.\n\n" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_college_physics" +"task_alias": "college_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml index e9e06de2fb5ec20b135a8ff500b670c26edb4edb..df9c4a51b6ca2715b921f2a33ef4755ebfacbc46 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml @@ -1,4 +1,8 @@ "dataset_name": "computer_security" -"description": "The following are multiple choice questions (with answers) about computer security.\n\n" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_computer_security" +"task_alias": "computer_security" diff --git a/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml index 30ca6efe8c9503376b53b6cd1d43dbb241ecb5e3..8ab59ed10f1cf02f47eacf7930a9efcbfc230fcf 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml @@ -1,4 +1,8 @@ "dataset_name": "conceptual_physics" -"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_conceptual_physics" +"task_alias": "conceptual_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml index 680cc507b15b8f41cf55a22251df5bbbc956bff1..a974fc8462d649c27ac09192d3cf84d9b162f82d 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml @@ -1,4 +1,8 @@ "dataset_name": "econometrics" -"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_econometrics" +"task_alias": "econometrics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml index 8dd63b337da03f41d59ab326410d0d923ec9389d..9c45cc6109d2b53322b2303efcb1f57e99d8d60c 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml @@ -1,4 +1,8 @@ "dataset_name": "electrical_engineering" -"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_electrical_engineering" +"task_alias": "electrical_engineering" diff --git a/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml 
b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml index 4979ee303c2a251c644dc0e11b95a574b3becc41..2154ab65454c9234b57d76782315c6559b0e7e0f 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml @@ -1,4 +1,8 @@ "dataset_name": "elementary_mathematics" -"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_elementary_mathematics" +"task_alias": "elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml index 9b73509b249e36a641e0481253db6000d9ed2bef..689d3d1f87503953be03cd54973d240b64f846f4 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml @@ -1,4 +1,8 @@ "dataset_name": "formal_logic" -"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_formal_logic" +"task_alias": "formal_logic" diff --git a/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml index 8c43a6c9e1a9c89f70cbdb5b9bf5880edfd58960..60b5c12921ee2ad1e2200392ecc3ffaef44f4105 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml @@ -1,4 +1,8 @@ "dataset_name": "global_facts" -"description": "The following are multiple choice questions (with answers) about global facts.\n\n" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_global_facts" +"task_alias": "global_facts" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml index 453d30333c0ca1a471528c915c402b26804e4614..c7e055dcc8dcfffb2572a8f4ab1770479e84daeb 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_biology" -"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_biology" +"task_alias": "high_school_biology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml index 714ee0e53857adfa0a649778b8149642683f569a..8e9421c1a6f7271ddfc87ad099a914f81bc031e9 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_chemistry" -"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" +"description": "The following are multiple choice questions (with 
answers) about high\ + \ school chemistry.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_chemistry" +"task_alias": "high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml index 9326e2598863f811b1a634ba5a5e73cb17f97346..87ec15cc3ad2bcd2d9d1d802389cb9e3d26e5abf 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_computer_science" -"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_computer_science" +"task_alias": "high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml index e212cd224b6270ec519482f2340aec20d2bf9f8b..be0d696a962c38a9ffb5404ea75cb2d402f10ec0 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_european_history" -"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_high_school_european_history" +"task_alias": "high_school_european_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml index a7fffc2522f4aa8421470e77b64d5a8c05a8bc81..57c5261a8be0e498111d834b8787341104f24775 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_geography" -"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_geography" +"task_alias": "high_school_geography" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml index 7255d60fd76a4fec61719f03dc223901c51c04f2..2e92f152081e102b07feb394038aa0b30ca827a2 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_government_and_politics" -"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" +"group": "mmlu_social_sciences" +"group_alias": 
"social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_government_and_politics" +"task_alias": "high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml index 29d9ddd73aaebcc3ff43339bf09ef6e398494612..988d132aad3309d086e8f8549bd6710f2bd2b817 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_macroeconomics" -"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_macroeconomics" +"task_alias": "high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml index 035e7a12ecf363d0b196a3237b97b91b5ba75f07..f7c07a608592709c08d4e2130e9f99abd5058880 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_mathematics" -"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_mathematics" +"task_alias": "high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml index 72b1c8cf04f51c7585f7d45ef7317fb4c64f058c..5339a0235cd8e27e51c7292a7cd2201c46cbc2f1 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_microeconomics" -"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_microeconomics" +"task_alias": "high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml index ef8f6ca518168f26eea0e11261104d5b5c9917b8..0fae0405aa093a0fb2eaea8fa8a48d0e4e220c9a 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_physics" -"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_physics" +"task_alias": "high_school_physics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml 
b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml index 5c4cce750a8613cab987b0c5de60dfe5c29703ec..31ecb18eb91397d2bad070d11d141f3375c66eaf 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_psychology" -"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_high_school_psychology" +"task_alias": "high_school_psychology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml index 20ed42ecfbbb67e5fe0f567916d0a61b4c38a803..54d70880eff3e5aa6ab8f8b233ee45b9f30ca25c 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_statistics" -"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_high_school_statistics" +"task_alias": "high_school_statistics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml index 18cd48da93d33b212bb89b3a572c63bd77543e07..e4432fe4459fcaa3c1ccf311c094ceb9122d0637 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_us_history" -"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_high_school_us_history" +"task_alias": "high_school_us_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml index b17daac60fd40112f663efe06b7fb1baf7315dad..08773a20089a5551710cb82447d6ce23d65c367d 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml @@ -1,4 +1,8 @@ "dataset_name": "high_school_world_history" -"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_high_school_world_history" +"task_alias": "high_school_world_history" diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml index 080b267606af429c44e7112f774f5b572e7def7f..c9e1feb154293d0a75b25837f365ec892756a465 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml @@ -1,4 
+1,8 @@ "dataset_name": "human_aging" -"description": "The following are multiple choice questions (with answers) about human aging.\n\n" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_human_aging" +"task_alias": "human_aging" diff --git a/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml index ca3389fe690be02d64d8bef3b19cc6e7e115cc91..715859a1e8ccd52ac406b9f9dcaf4dc80dbaa42f 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml @@ -1,4 +1,8 @@ "dataset_name": "human_sexuality" -"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_human_sexuality" +"task_alias": "human_sexuality" diff --git a/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml index a3d443e01d5a358b39188d4afd7502aef6f8ef1b..68765225bbee355a00a2d0d3e9a8931624fc45dd 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_international_law.yaml @@ -1,4 +1,8 @@ "dataset_name": "international_law" -"description": "The following are multiple choice questions (with answers) about international law.\n\n" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_international_law" +"task_alias": "international_law" diff --git a/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml index 4ba00a2ac6c619e7412ee8cc3bf3c0d93bc7e28e..e16de5c40b2a29cfc0cadeaad5ed11df9b7503e8 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml @@ -1,4 +1,8 @@ "dataset_name": "jurisprudence" -"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_jurisprudence" +"task_alias": "jurisprudence" diff --git a/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml index ea45a4f302dbc100401be702dbd49f7aa55faef7..8b12057bc78ffc195b4d016bfdc1c931f08e6267 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml @@ -1,4 +1,8 @@ "dataset_name": "logical_fallacies" -"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_logical_fallacies" +"task_alias": "logical_fallacies" diff --git a/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml 
index 2ba6d1624491d80ba106f25471c9c31feb6d9fd5..2387d680b2d57d2d409ecc1bf744f512dd4fcef3 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml @@ -1,4 +1,8 @@ "dataset_name": "machine_learning" -"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"group": "mmlu_stem" +"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_machine_learning" +"task_alias": "machine_learning" diff --git a/lm_eval/tasks/mmlu/default/mmlu_management.yaml b/lm_eval/tasks/mmlu/default/mmlu_management.yaml index b4ea6da942f476d302446be50b7e46811126826a..d0cdc812d7e94eb51a9f1d3148b3d51253c1c92a 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_management.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_management.yaml @@ -1,4 +1,8 @@ "dataset_name": "management" -"description": "The following are multiple choice questions (with answers) about management.\n\n" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_management" +"task_alias": "management" diff --git a/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml index afa30a0cd4ff6fd505efabf305172d8a42c9a698..a614db2977421ad88dcf4cd963647534f0d23b88 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_marketing.yaml @@ -1,4 +1,8 @@ "dataset_name": "marketing" -"description": "The following are multiple choice questions (with answers) about marketing.\n\n" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_marketing" +"task_alias": "marketing" diff --git a/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml index 92095635c4514cecc6879dc819a5b6a409fc695e..5d7ce7083bc62f5859988fb76f84c4dc102175e8 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml @@ -1,4 +1,8 @@ "dataset_name": "medical_genetics" -"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_medical_genetics" +"task_alias": "medical_genetics" diff --git a/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml index 94ebd1b02eb1e580260eccfd6b5c0ceb5150cf38..77e819cf831fb69381d75a1c039ef9ffaee3028f 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml @@ -1,4 +1,8 @@ "dataset_name": "miscellaneous" -"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_miscellaneous" +"task_alias": "miscellaneous" diff --git 
a/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml index 8bea0a1f227a5d28e3618232f62bd9da3968f31e..2df1a1ddb6302f7e563852fdd56cdb0d1668dad1 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml @@ -1,4 +1,8 @@ "dataset_name": "moral_disputes" -"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_moral_disputes" +"task_alias": "moral_disputes" diff --git a/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml index 71dcc693603d56d07067af283ea9b20d65554dd8..6da63cb24847aeaf580fc54f701326a187836517 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml @@ -1,4 +1,8 @@ "dataset_name": "moral_scenarios" -"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_moral_scenarios" +"task_alias": "moral_scenarios" diff --git a/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml index e6b4cbcd98a6ee033ab2da44b992c1b9f0c74489..df70fbb2de2f40a5cb10a9093d4a0243e45c6ffc 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml @@ -1,4 +1,8 @@ "dataset_name": "nutrition" -"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_nutrition" +"task_alias": "nutrition" diff --git a/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml index b9a0b2c5d1d633b559d421f9a2f9381102156bae..9dba09c10ebe140068f8054ee4d391f00682745d 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml @@ -1,4 +1,8 @@ "dataset_name": "philosophy" -"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ + \n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_philosophy" +"task_alias": "philosophy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml index 7f71bd54f55489e103655d806860a967cb7f91e5..d787898c62a174ddfd3895da3b164651f30d266b 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml @@ -1,4 +1,8 @@ "dataset_name": "prehistory" -"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": 
"mmlu_prehistory" +"task_alias": "prehistory" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml index 94ca6e6e933c8062c8377583e95d362769bd4b55..3443c336f2bc8dcf33e03bffa438a0e60a98ad3e 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml @@ -1,4 +1,8 @@ "dataset_name": "professional_accounting" -"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_professional_accounting" +"task_alias": "professional_accounting" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml index 074c34e699597bd0c1deec10b91abe371a80569e..f3a02631e28703dc8d3a6091175c49b9eb5676eb 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml @@ -1,4 +1,8 @@ "dataset_name": "professional_law" -"description": "The following are multiple choice questions (with answers) about professional law.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ law.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_professional_law" +"task_alias": "professional_law" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml index 2f99c316ec49f83a0be80dc20fa201c0ace50c7e..e8c49b5eb084d9cb2133c333f573ccbe3b8d9824 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml @@ -1,4 +1,8 @@ "dataset_name": "professional_medicine" -"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_professional_medicine" +"task_alias": "professional_medicine" diff --git a/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml index 015658487fe83fff110b7cecc2e7913b1c70793f..ec48a06fe11d98facbf5c4d24e03595ceb2fb2bd 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml @@ -1,4 +1,8 @@ "dataset_name": "professional_psychology" -"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_professional_psychology" +"task_alias": "professional_psychology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml index 0d46c66ec880369ca4e68a67c3936db442ddbbcd..db36fb498f6886549b420eaee34fb68dddfb126c 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml +++ 
b/lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml @@ -1,4 +1,8 @@ "dataset_name": "public_relations" -"description": "The following are multiple choice questions (with answers) about public relations.\n\n" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_public_relations" +"task_alias": "public_relations" diff --git a/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml index f30dffdefc21e82dcf5faf08b03fa47e8f2a762b..072dfd704815409d0a21576fa9620b1b5cb1a15b 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml @@ -1,4 +1,8 @@ "dataset_name": "security_studies" -"description": "The following are multiple choice questions (with answers) about security studies.\n\n" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_security_studies" +"task_alias": "security_studies" diff --git a/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml index c36bd40370f33e357a125d15321c1ca1993398fd..efcbd27b8d653f2df89f52880d47fe5a461e1246 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_sociology.yaml @@ -1,4 +1,8 @@ "dataset_name": "sociology" -"description": "The following are multiple choice questions (with answers) about sociology.\n\n" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_sociology" +"task_alias": "sociology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml index fe8c68d8fb798ee1b29c08999dde8979f0d97e1b..d80ee94adf4fd7b74dc368696924065862b9ce9e 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml @@ -1,4 +1,8 @@ "dataset_name": "us_foreign_policy" -"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\n" +"group": "mmlu_social_sciences" +"group_alias": "social_sciences" "include": "_default_template_yaml" "task": "mmlu_us_foreign_policy" +"task_alias": "us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/default/mmlu_virology.yaml b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml index 4cbd095998b8bee59c094a6d931b163f0a9f49c2..d935f92aa80e30f773960e6127466a4d7f768448 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_virology.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_virology.yaml @@ -1,4 +1,8 @@ "dataset_name": "virology" -"description": "The following are multiple choice questions (with answers) about virology.\n\n" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \n" +"group": "mmlu_other" +"group_alias": "other" "include": "_default_template_yaml" "task": "mmlu_virology" +"task_alias": "virology" diff --git a/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml 
b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml index 375efbae5d0483e654fe329f5e205088d7905bd6..8681354f7057d2d3ff2d68a98b3a5e6f23c960cc 100644 --- a/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml +++ b/lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml @@ -1,4 +1,8 @@ "dataset_name": "world_religions" -"description": "The following are multiple choice questions (with answers) about world religions.\n\n" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\n" +"group": "mmlu_humanities" +"group_alias": "humanities" "include": "_default_template_yaml" "task": "mmlu_world_religions" +"task_alias": "world_religions" diff --git a/lm_eval/tasks/mmlu/_cot_prompts.json b/lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json similarity index 100% rename from lm_eval/tasks/mmlu/_cot_prompts.json rename to lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb43b0481fb94e877f232a26bd5176dff0e82ab9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml @@ -0,0 +1,6 @@ +group: mmlu_flan_cot_fewshot +task: + - mmlu_flan_cot_fewshot_stem + - mmlu_flan_cot_fewshot_other + - mmlu_flan_cot_fewshot_social_sciences + - mmlu_flan_cot_fewshot_humanities diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index 8461b93a69484464a419056db4486f3601a5ce1b..e340271af4388423dbdb53b3c3682c5a3898839c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -1,8 +1,8 @@ group: mmlu_flan_cot_fewshot -dataset_path: cais/mmlu +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split validation_split: validation fewshot_split: dev -output_type: greedy_until +output_type: generate_until doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" filter_list: diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml index 5c5495915886e13fb61a34514929c790ff3576ac..f9d29bec020ef9d45514e72919969482520d0bef 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml @@ -1,5 +1,5 @@ -dataset_name: abstract_algebra -description: "The following are multiple choice questions (with answers) about abstract\ +"dataset_name": "abstract_algebra" +"description": "The following are multiple choice questions (with answers) about abstract\ \ algebra.\n\nQ: Statement 1 | Every element of a group generates a cyclic subgroup\ \ of the group. Statement 2 | The symmetric group S_10 has 10 elements.\n(A) True,\ \ True (B) False, False (C) True, False (D) False, True\nA: Let's think step by\ @@ -36,5 +36,6 @@ description: "The following are multiple choice questions (with answers) about a \ x = 2, hence x^2 + 1 does not have any roots. For c = 2 the polynomial x^2 + 2\ \ has two roots at x = 1 and x = 2. Hence Z_3[x]/(x^2 + c) is a field if and only\ \ if c = 1. The answer is (B)." 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_abstract_algebra +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml index 28ca1c4cf0404f0fab538cba43c88bd388633db2..144ffbe4fe920eaf8144f9aca3101f85aa0105fd 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml @@ -1,15 +1,15 @@ -dataset_name: anatomy -description: "The following are multiple choice questions (with answers) about anatomy.\n\ +"dataset_name": "anatomy" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ \nQ: Which of the following is the body cavity that contains the pituitary gland?\n\ (A) Abdominal (B) Cranial (C) Pleural (D) Spinal\nA: Let's think step by step. We\ - \ refer to Wikipedia articles on anatomy for help. Let\u2019s solve this problem\ - \ step by step. The pituitary gland is the major endocrine gland attached to the\ - \ base of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\ - \nQ: Which of these branches of the trigeminal nerve contain somatic motor processes?\n\ + \ refer to Wikipedia articles on anatomy for help. Let’s solve this problem step\ + \ by step. The pituitary gland is the major endocrine gland attached to the base\ + \ of the brain, and it is contained in the Cranial cavity. The answer is (B).\n\n\ + Q: Which of these branches of the trigeminal nerve contain somatic motor processes?\n\ (A) The supraorbital nerve (B) The infraorbital nerve (C) The mental nerve (D) None\ \ of the above\nA: Let's think step by step. We refer to Wikipedia articles on anatomy\ - \ for help. Let\u2019s solve this problem step by step. \nWe know the following:\ - \ (A) The supraorbital nerve (also known as the frontal nerve) is the largest branch\ + \ for help. Let’s solve this problem step by step. \nWe know the following: (A)\ + \ The supraorbital nerve (also known as the frontal nerve) is the largest branch\ \ of the ophthalmic nerve and branch of ophthalmic division of the trigeminal nerve.\ \ (B) The infraorbital nerve is a branch of the maxillary division of the trigeminal\ \ nerve. (C) The mental nerve is a branch of the mandibular division of the trigeminal\ @@ -19,39 +19,39 @@ description: "The following are multiple choice questions (with answers) about a (A) excess overbite of the upper lateral incisors. (B) negative overjet of the upper\ \ central incisors. (C) excess overjet of the upper lateral incisors. (D) excess\ \ overjet of the upper central incisors.\nA: Let's think step by step. We refer\ - \ to Wikipedia articles on anatomy for help. Let\u2019s solve this problem step\ - \ by step. This is a question related to anatomy and orthodontics. Excess overjet\ - \ is associated with Class II occlusions; therefore, we can safely eliminate (B)\ - \ from the list, as negative overjet is often associated with Class III occlusions.\ - \ Now, we need to determine the location of the excess overjet, and that would be\ - \ the upper (maxillary) lateral incisors. Only (C) has the correct information.\ - \ The answer is (C).\n\nQ: The pleura\n(A) have no sensory innervation. (B) are\ - \ separated by a 2 mm space. (C) extend into the neck. (D) are composed of respiratory\ - \ epithelium.\nA: Let's think step by step. 
We refer to Wikipedia articles on anatomy\ - \ for help. Let\u2019s solve this problem step by step. First, recall that the pleura\ - \ refers to the thin layer of tissue that covers the lungs and lines the interior\ - \ wall of the chest cavity. Now, let\u2019s look at each option:\nOption (A): \u201C\ - The pleura have no sensory innervation.\u201D This information is not correct. The\ - \ pleura do have a sensory innervation.\nOption (B): \u201CThe pleura are separated\ - \ by a 2 mm space.\u201D This information is not correct. There is a very thin \u201C\ - potential\u201D space between the layers of the pleura; however, it is typically\ - \ filled with serous pleural fluid. \nOption (C): \u201CThe pleura extend into the\ - \ neck.\u201D This information is actuakky true. The cervical pleura, also known\ - \ as the dome of the pleuradome of the pleura, lines the extendsiton of the pleural\ - \ cavity into the neck.\nOption (D): \u201CThe pleura are composed of respiratory\ - \ epithelium.\u201D This information is not correct. The pleaura are composed of\ - \ connective tissue (CT).\nBecause (A), (B), and (D) are all incorrect, (D) is the\ - \ only correct answer. The answer is (C).\n\nQ: What is the embryological origin\ + \ to Wikipedia articles on anatomy for help. Let’s solve this problem step by step.\ + \ This is a question related to anatomy and orthodontics. Excess overjet is associated\ + \ with Class II occlusions; therefore, we can safely eliminate (B) from the list,\ + \ as negative overjet is often associated with Class III occlusions. Now, we need\ + \ to determine the location of the excess overjet, and that would be the upper (maxillary)\ + \ lateral incisors. Only (C) has the correct information. The answer is (C).\n\n\ + Q: The pleura\n(A) have no sensory innervation. (B) are separated by a 2 mm space.\ + \ (C) extend into the neck. (D) are composed of respiratory epithelium.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on anatomy for help. Let’s\ + \ solve this problem step by step. First, recall that the pleura refers to the thin\ + \ layer of tissue that covers the lungs and lines the interior wall of the chest\ + \ cavity. Now, let’s look at each option:\nOption (A): “The pleura have no sensory\ + \ innervation.” This information is not correct. The pleura do have a sensory innervation.\n\ + Option (B): “The pleura are separated by a 2 mm space.” This information is not\ + \ correct. There is a very thin “potential” space between the layers of the pleura;\ + \ however, it is typically filled with serous pleural fluid. \nOption (C): “The\ + \ pleura extend into the neck.” This information is actually true. The cervical\ + \ pleura, also known as the dome of the pleura, lines the extension\ + \ of the pleural cavity into the neck.\nOption (D): “The pleura are composed of\ + \ respiratory epithelium.” This information is not correct. The pleura are composed\ + \ of connective tissue (CT).\nBecause (A), (B), and (D) are all incorrect, (C) is\ + \ the only correct answer. The answer is (C).\n\nQ: What is the embryological origin\ \ of the hyoid bone?\n(A) The first pharyngeal arch (B) The first and second pharyngeal\ \ arches (C) The second pharyngeal arch (D) The second and third pharyngeal arches\n\ A: Let's think step by step. We refer to Wikipedia articles on anatomy for help.\ - \ Let\u2019s solve this problem step by step. 
The hyoid bone, which is also known\ - \ as the hyooid, is a a small U-shaped bone located in the anterior neck. In its\ - \ resting position, it lies between the ase of the mandible and the third cervical\ - \ vertebrae. We know that the second and the third pharyngeal arches give rise to\ - \ the horns of the hyoid bone; therefore, the embryological origin of the hyoid\ - \ bone are the second and the third pharyngeal arches\u2014this information is covered\ - \ in the last option (D). Therefore, we conclude that (D) must be the correct answer.\ - \ The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_anatomy + \ Let’s solve this problem step by step. The hyoid bone, which is also known as\ + \ the hyoid, is a small U-shaped bone located in the anterior neck. In its resting\ + \ position, it lies between the base of the mandible and the third cervical vertebrae.\ + \ We know that the second and the third pharyngeal arches give rise to the horns\ + \ of the hyoid bone; therefore, the embryological origin of the hyoid bone is the\ + \ second and the third pharyngeal arches—this information is covered in the last\ + \ option (D). Therefore, we conclude that (D) must be the correct answer. The answer\ + \ is (D)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml index cd50fd555c4b93a21f310c592066449e3fd1f9a8..dc365959a9d20a1c727838d7f0743717d1a8d38a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml @@ -1,5 +1,5 @@ -dataset_name: astronomy -description: "The following are multiple choice questions (with answers) about astronomy.\n\ +"dataset_name": "astronomy" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\ \nQ: Where do most short-period comets come from and how do we know?\n(A) The Kuiper\ \ belt; short period comets tend to be in the plane of the solar system just like\ \ the Kuiper belt. (B) The Kuiper belt; short period comets tend to come from random\ @@ -16,39 +16,40 @@ description: "The following are multiple choice questions (with answers) about a \ lighter on Mars. (C) It would be harder since the truck is lighter on Mars. (D)\ \ It would be the same no matter where you are.\nA: Let's think step by step. If\ \ we assume that there is no friction, the force needed to accelerate the truck\
The answer is (A).\n\nQ: Why isn't there a planet where the asteroid\ - \ belt is located?\n(A) A planet once formed here but it was broken apart by a catastrophic\ - \ collision. (B) There was not enough material in this part of the solar nebula\ - \ to form a planet. (C) There was too much rocky material to form a terrestrial\ - \ planet but not enough gaseous material to form a jovian planet. (D) Resonance\ - \ with Jupiter prevented material from collecting together to form a planet.\nA:\ - \ Let's think step by step. The asteroid belt is a stellar disc consisting of a\ - \ large number of asteroids between Mars and Jupiter's orbits. The asteroids in\ - \ this belt are affected by the gravitational pull from both other asteroids and\ - \ nearby planets. Due to the strong gravitational force of Jupiter there are resonances\ - \ that give rise to low density regions of asteroids known as the Kirkwood gap.\ - \ So (B) and (C) are not correct since it\u2019s not a lack of material that prevents\ - \ a planet from being formed, and (A) is incorrect because the Kirkwood gap would\ - \ have prevented a planet from forming in the first place, and (D) is the correct\ - \ option. The answer is (D).\n\nQ: Why is Mars red?\n(A) Because the surface is\ - \ covered with heavily oxidized (\"rusted\") minerals. (B) Because the atmosphere\ - \ scatters more light at bluer wavelengths transmitting mostly red light. (C) Because\ - \ Mars is covered with ancient lava flows which are red in color. (D) Because flowing\ - \ water on Mars's surface altered the surface minerals several billion years ago.\n\ - A: Let's think step by step. Option (B) is not correct because if the red color\ - \ was caused by the scattering off the atmosphere, then the earth with a much thicker\ - \ atmosphere would also look red. Options (C) and (D) are not specific enough about\ - \ why the color of the surface would be red, while (A) is correct because it explains\ - \ that the surface is red due to the rusted materials on the surface and the red\ - \ color comes from the rust. So the correct option is (A). The answer is (A)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_astronomy + \ is by Newton’s second law only dependent on the mass of the truck. Hence (A),\ + \ (B) and (C) are incorrect since it doesn’t matter that it’s on Mars, and (D) is\ + \ the correct answer. The answer is (D).\n\nQ: Say the pupil of your eye has a diameter\ + \ of 5 mm and you have a telescope with an aperture of 50 cm. How much more light\ + \ can the telescope gather than your eye?\n(A) 10000 times more (B) 100 times more\ + \ (C) 1000 times more (D) 10 times more\nA: Let's think step by step. The amount\ + \ of light is proportional to the aperture area $A = \\pi D^2/4$ for a lens with\ + \ diameter $D$, so the relative amounts of light between the eye with diameter 5mm\ + \ and the telescope with diameter 50 cm is $(50 cm)^2/(5mm)^2 = 10000$. The answer\ + \ is (A).\n\nQ: Why isn't there a planet where the asteroid belt is located?\n(A)\ + \ A planet once formed here but it was broken apart by a catastrophic collision.\ + \ (B) There was not enough material in this part of the solar nebula to form a planet.\ + \ (C) There was too much rocky material to form a terrestrial planet but not enough\ + \ gaseous material to form a jovian planet. 
(D) Resonance with Jupiter prevented\ + \ material from collecting together to form a planet.\nA: Let's think step by step.\ + \ The asteroid belt is a stellar disc consisting of a large number of asteroids\ + \ between Mars and Jupiter's orbits. The asteroids in this belt are affected by\ + \ the gravitational pull from both other asteroids and nearby planets. Due to the\ + \ strong gravitational force of Jupiter there are resonances that give rise to low\ + \ density regions of asteroids known as the Kirkwood gap. So (B) and (C) are not\ + \ correct since it’s not a lack of material that prevents a planet from being formed,\ + \ and (A) is incorrect because the Kirkwood gap would have prevented a planet from\ + \ forming in the first place, and (D) is the correct option. The answer is (D).\n\ + \nQ: Why is Mars red?\n(A) Because the surface is covered with heavily oxidized\ + \ (\"rusted\") minerals. (B) Because the atmosphere scatters more light at bluer\ + \ wavelengths transmitting mostly red light. (C) Because Mars is covered with ancient\ + \ lava flows which are red in color. (D) Because flowing water on Mars's surface\ + \ altered the surface minerals several billion years ago.\nA: Let's think step by\ + \ step. Option (B) is not correct because if the red color was caused by the scattering\ + \ off the atmosphere, then the earth with a much thicker atmosphere would also look\ + \ red. Options (C) and (D) are not specific enough about why the color of the surface\ + \ would be red, while (A) is correct because it explains that the surface is red\ + \ due to the rusted materials on the surface and the red color comes from the rust.\ + \ So the correct option is (A). The answer is (A)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml index 60d939a8691af2fa94487f4b485ea696585b0ae7..53e6b96d368a3670b116cdd09d55be44bac38317 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml @@ -1,5 +1,5 @@ -dataset_name: business_ethics -description: "The following are multiple choice questions (with answers) about business\ +"dataset_name": "business_ethics" +"description": "The following are multiple choice questions (with answers) about business\ \ ethics.\n\nQ: In contrast to _______, _______ aim to reward favourable behaviour\ \ by companies. The success of such campaigns have been heightened through the use\ \ of ___________, which allow campaigns to facilitate the company in achieving _________\ @@ -7,12 +7,12 @@ description: "The following are multiple choice questions (with answers) about b \ Boycotts, Digital technology, Increased Sales (C) Boycotts, Buyalls, Blockchain\ \ technology, Charitable donations (D) Boycotts, Buycotts, Digital technology, Increased\ \ Sales\nA: Let's think step by step. We refer to Wikipedia articles on business\ - \ ethics for help. The sentence that best uses the possible options above is \u201C\ - In contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies.\ + \ ethics for help. 
The sentence that best uses the possible options above is “In\ + \ contrast to *boycotts*, *buycotts* aim to reward favourable behavior by companies.\ \ The success of such campaigns have been heightened through the use of *digital\ \ technology*, which allow campaigns to facilitate the company in achieving *increased\ - \ sales*.\u201D The answer is (D).\n\nQ: _______ is the direct attempt to formally\ - \ or informally manage ethical issues or problems, through specific policies, practices\ + \ sales*.” The answer is (D).\n\nQ: _______ is the direct attempt to formally or\ + \ informally manage ethical issues or problems, through specific policies, practices\ \ and programmes.\n(A) Corporate social responsibility (B) Business ethics management\ \ (C) Sustainability (D) Environmental management\nA: Let's think step by step.\ \ We refer to Wikipedia articles on business ethics for help. The direct attempt\ @@ -26,30 +26,31 @@ description: "The following are multiple choice questions (with answers) about b \ action, Violent direct action, Non-violent direct-action Boycott (D) Non-violent\ \ direct action, Instrumental action, Indirect action, Information campaign\nA:\ \ Let's think step by step. We refer to Wikipedia articles on business ethics for\ - \ help. The sentence that best uses the possible options above is \u201CThree contrasting\ + \ help. The sentence that best uses the possible options above is “Three contrasting\ \ tactics that CSO's can engage in to meet their aims are *indirect action*, which\ \ typically involves research and communication, *violent direct action*, which\ \ may involve physically attacking a company's operations or *non-violent direct\ - \ action*, often involving some form of *boycott*.\u201D The answer is (C).\n\n\ - Q: To ensure the independence of the non-executive board members, there are a number\ + \ action*, often involving some form of *boycott*.” The answer is (C).\n\nQ: To\ + \ ensure the independence of the non-executive board members, there are a number\ \ of steps which can be taken, which include non-executives being drawn from _______\ \ the company, being appointed for a _________ time period as well as being appointed\ \ _________.\n(A) Outside, Limited, Independently (B) Inside, Limited, Intermittently\ \ (C) Outside, Unlimited, Intermittently (D) Inside, Unlimited, Independently\n\ A: Let's think step by step. We refer to Wikipedia articles on business ethics for\ - \ help. The sentence that best uses the possible options above is \u201CTo ensure\ - \ the independence of the non-executive board members, there are a number of steps\ - \ which can be taken, which include non-executives being draw from *outside* the\ - \ company, being appointed for a *limited* time period as well as being imported\ - \ *independently*. The answer is (A).\n\nQ: Beyond the business case for engaging\ - \ in CSR there are a number of moral arguments relating to: negative _______, the\ - \ _______that corporations possess and the ________ of business and society.\n(A)\ - \ Externalities, Power, Independence (B) Publicity, Insubstantial resources, Mutual\ - \ dependence (C) Publicity, Power, Independence (D) Externalities, Power, Mutual\ - \ dependence\nA: Let's think step by step. We refer to Wikipedia articles on business\ - \ ethics for help. 
The sentence that best uses the possible options above is \u201C\ - Beyond the business case for engaging the CSR there are a number of moral arguments\ - \ relating to: negative *externalities*, the *power* that corporations possess and\ - \ the *mutual independence* of business and society. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_business_ethics + \ help. The sentence that best uses the possible options above is “To ensure the\ + \ independence of the non-executive board members, there are a number of steps which\ + \ can be taken, which include non-executives being drawn from *outside* the company,\ + \ being appointed for a *limited* time period as well as being appointed *independently*.”\ + \ The answer is (A).\n\nQ: Beyond the business case for engaging in CSR there are\ + \ a number of moral arguments relating to: negative _______, the _______that corporations\ + \ possess and the ________ of business and society.\n(A) Externalities, Power, Independence\ + \ (B) Publicity, Insubstantial resources, Mutual dependence (C) Publicity, Power,\ + \ Independence (D) Externalities, Power, Mutual dependence\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on business ethics for help. The sentence\ + \ that best uses the possible options above is “Beyond the business case for engaging\ + \ in CSR there are a number of moral arguments relating to: negative *externalities*,\ + \ the *power* that corporations possess and the *mutual dependence* of business\ + \ and society.” The answer is (D)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml index a591a7bd4c0f4e788ba7db1f5cfb422709eef18f..6f73eda0118146c2886a1d8bcb43b41a99ec597c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml @@ -1,58 +1,35 @@ -dataset_name: clinical_knowledge -description: 'The following are multiple choice questions (with answers) about clinical - knowledge. - - - Q: Glycolysis is the name given to the pathway involving the conversion of: - - (A) glycogen to glucose-1-phosphate. (B) glycogen - or glucose to fructose. (C) glycogen - or glucose to pyruvate or lactate. (D) glycogen - or glucose to pyruvate or acetyl - CoA. - - A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge - for help. Glycolysis is the name given to the pathway involving conversion of glycogen - or glucose to pyruvate or lactate. The answer is (C). - - - Q: What is the difference between a male and a female catheter? - - (A) Male and female catheters are different colours. (B) Male catheters are longer - than female catheters. (C) Male catheters are bigger than female catheters. (D) - Female catheters are longer than male catheters. - - A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge - for help. The difference between a male and female catheter is that male catheters - tend to be longer than female catheters. The answer is (B). - - - Q: How many attempts should you make to cannulate a patient before passing the job - on to a senior colleague, according to the medical knowledge of 2020? - - (A) 4 (B) 3 (C) 2 (D) 1 - - A: Let''s think step by step. 
We refer to Wikipedia articles on clinical knowledge - for help. According to the medical protocol as of 2020, you should make two attempts - to cannulate a patient before passing the job on to a more-senior practitioner. - The answer is (C). - - - Q: In the assessment of the hand function which of the following is true? - - (A) Abduction of the thumb is supplied by spinal root T2 (B) Opposition of the thumb - by opponens policis is supplied by spinal root T1 (C) Finger adduction is supplied - by the median nerve (D) Finger abduction is mediated by the palmar interossei - - A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge - for help. Of all the options, it is only true that the opposition of the thumb by - opponens pollicis is supplied by spinal root T1. The answer is (B). - - - Q: The energy for all forms of muscle contraction is provided by: - - (A) ATP. (B) ADP. (C) phosphocreatine. (D) oxidative phosphorylation. - - A: Let''s think step by step. We refer to Wikipedia articles on clinical knowledge - for help. The energy for muscular contraction is provided by ATP (adenosine triphosphate), - which is the powerhouse of the cell. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_clinical_knowledge +"dataset_name": "clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\nQ: Glycolysis is the name given to the pathway involving the conversion\ + \ of:\n(A) glycogen to glucose-1-phosphate. (B) glycogen or glucose to fructose.\ + \ (C) glycogen or glucose to pyruvate or lactate. (D) glycogen or glucose to pyruvate\ + \ or acetyl CoA.\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ clinical knowledge for help. Glycolysis is the name given to the pathway involving\ + \ conversion of glycogen or glucose to pyruvate or lactate. The answer is (C).\n\ + \nQ: What is the difference between a male and a female catheter?\n(A) Male and\ + \ female catheters are different colours. (B) Male catheters are longer than female\ + \ catheters. (C) Male catheters are bigger than female catheters. (D) Female catheters\ + \ are longer than male catheters.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on clinical knowledge for help. The difference between a male and female\ + \ catheter is that male catheters tend to be longer than female catheters. The answer\ + \ is (B).\n\nQ: How many attempts should you make to cannulate a patient before\ + \ passing the job on to a senior colleague, according to the medical knowledge of\ + \ 2020?\n(A) 4 (B) 3 (C) 2 (D) 1\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on clinical knowledge for help. According to the medical protocol as\ + \ of 2020, you should make two attempts to cannulate a patient before passing the\ + \ job on to a more-senior practitioner. The answer is (C).\n\nQ: In the assessment\ + \ of the hand function which of the following is true?\n(A) Abduction of the thumb\ + \ is supplied by spinal root T2 (B) Opposition of the thumb by opponens policis\ + \ is supplied by spinal root T1 (C) Finger adduction is supplied by the median nerve\ + \ (D) Finger abduction is mediated by the palmar interossei\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on clinical knowledge for help. Of all\ + \ the options, it is only true that the opposition of the thumb by opponens pollicis\ + \ is supplied by spinal root T1. 
The answer is (B).\n\nQ: The energy for all forms\ + \ of muscle contraction is provided by:\n(A) ATP. (B) ADP. (C) phosphocreatine.\ + \ (D) oxidative phosphorylation.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on clinical knowledge for help. The energy for muscular contraction is\ + \ provided by ATP (adenosine triphosphate), which is the powerhouse of the cell.\ + \ The answer is (A)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml index be51794a6867d8fd1b05b09ad95c85ae71087992..1cd13c56649fcec8255635d4e4dde1827e27d12f 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml @@ -1,5 +1,5 @@ -dataset_name: college_biology -description: "The following are multiple choice questions (with answers) about college\ +"dataset_name": "college_biology" +"description": "The following are multiple choice questions (with answers) about college\ \ biology.\n\nQ: Which of the following represents an accurate statement concerning\ \ arthropods?\n(A) They possess an exoskeleton composed primarily of peptidoglycan.\ \ (B) They possess an open circulatory system with a dorsal heart. (C) They are\ @@ -19,7 +19,7 @@ description: "The following are multiple choice questions (with answers) about c \ Law, $p^2 + 2 p q + q^2 = 1$, and $p + q = 1$ where $p$ is the frequency of the\ \ dominant allele, $q$ is the frequency of the recessive allele, and $p^2$, $q^2$,\ \ and $2pq$ are the frequencies of dominant homozygous, recessive homozygous, and\ - \ heterozygous individuals, respectively. \u200BThe frequency of the recessive allele\ + \ heterozygous individuals, respectively. ​The frequency of the recessive allele\ \ (q) is $\\sqrt{\frac{1}{400}} = 0.05$. We have $p = 1 - q = 0.95$. The frequency\ \ of heterozygous individuals is $2pq = 2 \\cdot 0.05 \\cdot 0.95 = 0.095$. The\ \ number of heterozygous individuals is equal to the frequency of heterozygous individuals\ @@ -56,5 +56,6 @@ description: "The following are multiple choice questions (with answers) about c \ the human and bird forearms, which rules out (D). Humans and birds do belong to\ \ the same clade - a group of organisms composed of a common ancestor. The answer\ \ is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_biology +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml index a02c909e8a4b75ab8d67d7f0ad723a7c2e8fd3d3..08f002b5447c6e21090eb845fe19d6f56faef75e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml @@ -1,37 +1,38 @@ -dataset_name: college_chemistry -description: "The following are multiple choice questions (with answers) about college\ - \ chemistry.\n\nQ: 3 Cl\u2212(aq) + 4 CrO_4^2\u2212(aq) + 23 H+(aq) \u2192 3 HClO2(aq)\ - \ + 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl\u2212(aq) behaves as\n\ - (A) an acid (B) a base (C) a catalyst (D) a reducing agent\nA: Let's think step\ - \ by step. 
A molecule that behaves as a base accepts an H+ ion (or proton) from\ - \ another molecule, whereas a molecule that behaves as an acid donates an H+ ion\ - \ (or proton) to another molecule. Neither of these is the case for Cl in this reaction,\ - \ which rules out (A) and (B). A catalyst is a substance that only accelerates a\ - \ reaction without itself undergoing chemical change, which is not the case here.\ - \ This rules out (C). Instead, the $Cl^{-} molecules carry a negative charge, which\ - \ they donate in the reaction to form 3 HClO2. This is the behavior of a reducing\ - \ agent, or (D). The answer is (D).\n\nQ: Which of the following statements about\ - \ the lanthanide elements is NOT true?\n(A) The most common oxidation state for\ - \ the lanthanide elements is +3. (B) Lanthanide complexes often have high coordination\ - \ numbers (> 6). (C) All of the lanthanide elements react with aqueous acid to liberate\ - \ hydrogen. (D) The atomic radii of the lanthanide elements increase across the\ - \ period from La to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide\ - \ elements in fact decrease across the period from La to Lu. Options (A), (B), and\ - \ (C) are all true. This means that only (D) is NOT true. The answer is (D).\n\n\ - Q: Which of the following lists the hydrides of group-14 elements in order of thermal\ - \ stability, from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4\ - \ < SnH4 < CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4\ - \ < GeH4 < SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14\ - \ hydrides decreases as we move from the top of group 14 to the bottom. The order\ - \ of elements in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in\ - \ order of increasing thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4,\ - \ or answer (A). The answer is (A).\n\nQ: Predict the number of lines in the EPR\ - \ spectrum of a solution of 13C-labelled methyl radical (13CH3\u2022), assuming\ - \ the lines do not overlap.\n(A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step\ - \ by step. The electron paramagnetic resonance spectrum will be split by two forms\ - \ of interactions. The first is the hyperfine interaction with the 13C (nuclear\ - \ spin $I = \nrac{1}{2}$) which will split the spectrum into 2 lines. This will\ - \ be further split into 4 lines by the interaction with three equivalent 1H nuclei.\ - \ The total number of lines is therefore $2 \\cdot 4 = 8$. The answer is (E)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_chemistry +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\nQ: 3 Cl−(aq) + 4 CrO_4^2−(aq) + 23 H+(aq) → 3 HClO2(aq) + 4 Cr3+(aq)\ + \ + 10 H2O(l). In the reaction shown above, Cl−(aq) behaves as\n(A) an acid (B)\ + \ a base (C) a catalyst (D) a reducing agent\nA: Let's think step by step. A molecule\ + \ that behaves as a base accepts an H+ ion (or proton) from another molecule, whereas\ + \ a molecule that behaves as an acid donates an H+ ion (or proton) to another molecule.\ + \ Neither of these is the case for Cl in this reaction, which rules out (A) and\ + \ (B). A catalyst is a substance that only accelerates a reaction without itself\ + \ undergoing chemical change, which is not the case here. This rules out (C). 
Instead,\ + \ the $Cl^{-}$ molecules carry a negative charge, which they donate in the reaction\ + \ to form 3 HClO2. This is the behavior of a reducing agent, or (D). The answer\ + \ is (D).\n\nQ: Which of the following statements about the lanthanide elements\ + \ is NOT true?\n(A) The most common oxidation state for the lanthanide elements\ + \ is +3. (B) Lanthanide complexes often have high coordination numbers (> 6). (C)\ + \ All of the lanthanide elements react with aqueous acid to liberate hydrogen. (D)\ + \ The atomic radii of the lanthanide elements increase across the period from La\ + \ to Lu.\nA: Let's think step by step. The atomic radii of the lanthanide elements\ + \ in fact decrease across the period from La to Lu. Options (A), (B), and (C) are\ + \ all true. This means that only (D) is NOT true. The answer is (D).\n\nQ: Which\ + \ of the following lists the hydrides of group-14 elements in order of thermal stability,\ + \ from lowest to highest?\n(A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4 < SnH4 <\ + \ CH4 < GeH4 < SiH4 (C) CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4 < GeH4 <\ + \ SnH4 < SiH4\nA: Let's think step by step. The thermal stability of group-14 hydrides\ + \ decreases as we move from the top of group 14 to the bottom. The order of elements\ + \ in the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in order of increasing\ + \ thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4, or answer (A). The\ + \ answer is (A).\n\nQ: Predict the number of lines in the EPR spectrum of a solution\ + \ of 13C-labelled methyl radical (13CH3•), assuming the lines do not overlap.\n\ + (A) 4 (B) 3 (C) 6 (D) 24 (E) 8\nA: Let's think step by step. The electron paramagnetic\ + \ resonance spectrum will be split by two forms of interactions. The first is the\ + \ hyperfine interaction with the 13C (nuclear spin $I = \\frac{1}{2}$) which will\ + \ split the spectrum into 2 lines. This will be further split into 4 lines by the\ + \ interaction with three equivalent 1H nuclei. The total number of lines is therefore\ + \ $2 \\cdot 4 = 8$. The answer is (E)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml index 20b398c15c591b4376c3daafd2255357b8b3546f..e3a207054e1a656d0066456c79fa4a6b916d462a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml @@ -1,189 +1,79 @@ -dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college - computer science. - - - Q: Which of the following regular expressions is equivalent to (describes the same - set of strings as) (a* + b)*(c + d)? - - (A) a*(c + d)+ b(c + d) - - (B) a*(c + d)* + b(c + d)* - - (C) a*(c + d)+ b*(c + d) - - (D) (a + b)*c +(a + b)*d - - A: Let''s think step by step. We know that: - - 1. (X* + Y)* = (X + Y)* - - 2. X(Y + Z)? = XY + XZ - - Using equation 1 we can rewrite (a* + b)*(c + d)? as: - - 3. (a + b)*(c + d)? - - Using equation 2 we can rewrite equation 3 as: - - (a + b)*c + (a + b)*d The answer is (D). - - - Q: The Singleton design pattern is used to guarantee that only a single instance - of a class may be instantiated. Which of the following is (are) true of this design - pattern? - - I. 
The Singleton class has a static factory method to provide its instance. - - II. The Singleton class can be a subclass of another class. - - III. The Singleton class has a private constructor. - - (A) I only - - (B) II only - - (C) III only - - (D) I, II, and III - - A: Let''s think step by step. Statement I is a correct statement about a Singleton, - because a Singleton restricts instantiation to a single, static method. Statement - II is also correct, because there is no inherent restriction regarding the inheritance - of a Singleton. Statement III is also correct, because a Singletons must be instantiated - only once, so its constructor is made private to prevent any construction except - via its static factory method. - - Given these facts, statements I, II, and III are all correct. The answer is (D). - - - Q: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, . . - . , R7 and supports the following operations: - - ADD Rs1, Rs2, Rd (Add Rs1 to Rs2 and put the sum in Rd) - - MUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the product in Rd) - - An operation normally takes one cycle; however, an operation takes two cycles if - it produces a result required by the immediately following operation in an operation - sequence. - - Consider the expression AB + ABC + BC, where variables A, B, C are located in registers - R0, R1, R2. If the contents of these three registers must not be modified, what - is the minimum number of clock cycles required for an operation sequence that computes - the value of AB + ABC + BC? - - (A) 5 (B) 6 (C) 7 (D) 8 - - A: Let''s think step by step. First, we are given that A is in R0, B is in R1, and - C is in R2. - - Next, we can see that we must compute three multiplies (AB, BC, and ABC) and two - adds (AB + ABC, (AB + ABC) + BC) to compute our final answer, resulting in a minimum - of five clock cycles. - - Next, we can see that there is no way to avoid at least one pipeline stall when - computing our final answer, because to compute our final sum we must wait at least - one cycle for the results from the previous stage to be ready. Thus, our minimum - number of cycles must be 6. - - We can verify that we can create a solution that requires only six cycles as follows: - - compute AB: MUL R0, R1, R3 - - compute BC: MUL R1, R2, R4 - - compute ABC: MUL R3, R4, R5 - - compute AB + BC: ADD R3, R4, R6 - - STALL - - compute AB + ABC + BC: ADD R5, R6, R7 - - So there are 6 cycles. The answer is (B). - - - Q: A compiler generates code for the following assignment statement. - - G := (A + B) * C - (D + E) * F - - The target machine has a single accumulator and a single-address instruction set - consisting of instructions load, store, add, subtract, and multiply. For the arithmetic - operations, the left operand is taken from the accumulator and the result appears - in the accumulator. The smallest possible number of instructions in the resulting - code is - - (A) 5 (B) 6 (C) 7 (D) 9 - - A: Let''s think step by step. We can compute the final answer with the following - sequence of operations: - - 1. LOAD D (accumulator = D) - - 2. ADD E (accumulator = D+E) - - 3. MUL F (accumulator = (D+E)*F) - - 4. STORE X (X = (D+E)*F) - - 5. LOAD A (accumulator = A) - - 6. ADD B (accumulator = A+B) - - 7. MUL C (accumulator = (A+B)*C) - - 8. SUB X (accumulator = (A+B)*C - (D+E)*F) - - 9. STORE G (G = (A+B)*C - (D+E)*F) - - This sequence takes 9 instructions. The answer is (D). 
- - - Q: Consider a computer design in which multiple processors, each with a private - cache memory, share global memory using a single bus. This bus is the critical system - resource. Each processor can execute one instruction every 500 nanoseconds as long - as memory references are satisfied by its local cache. When a cache miss occurs, - the processor is delayed for an additional 2,000 nanoseconds. During half of this - additional delay, the bus is dedicated to serving the cache miss. During the other - half, the processor cannot continue, but the bus is free to service requests from - other processors. On average, each instruction requires 2 memory references. On - average, cache misses occur on 1 percent of references. What proportion of the capacity - of the bus would a single processor consume, ignoring delays due to competition - from other processors? - - (A) 1/50 (B) 1/27 (C) 1/25 (D) 2/27 - - A: Let''s think step by step. We know that each instruction requires two memory - references per instruction, and that there is an average cache miss rate of one - percent. - - Thus a given processor has: - - (1 cache miss / 100 references) * (2 references / instruction) = - - (2 cache misses / 100 instructions), so: - - misses_per_instruction = 1 cache miss / 50 instructions. - - Next, we know that each instruction requires 500 nanoseconds when there is no cache - miss, and 500 + 2000 = 2500 nanoseconds when there is a cache miss. Thus: - - 50 instructions / (49 * 500) + (1 * 2500) nanoseconds, so: - - instructions_per_ns = 50 instructions / 27000 nanoseconds. - - Now, we know that each cache miss locks the bus for half of the 2000 nanosecond - cache miss delay, or 1000 nanoseconds, so: - - lock_ns_per_miss = 1000 nanoseconds / cache miss. - - Thus we can see that on average a single processor will lock the bus for: - - lock_ns_per_miss * misses_per_instruction * instructions_per_ns = - - (1000 nanoseconds / cache miss) * (1 cache miss / 50 instructions) * (50 instructions - / 27000 nanoseconds) = 1000 * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer - is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_computer_science +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\nQ: Which of the following regular expressions is equivalent\ + \ to (describes the same set of strings as) (a* + b)*(c + d)?\n(A) a*(c + d)+ b(c\ + \ + d)\n(B) a*(c + d)* + b(c + d)*\n(C) a*(c + d)+ b*(c + d)\n(D) (a + b)*c +(a\ + \ + b)*d\nA: Let's think step by step. We know that:\n1. (X* + Y)* = (X + Y)*\n\ + 2. X(Y + Z)? = XY + XZ\nUsing equation 1 we can rewrite (a* + b)*(c + d)? as:\n\ + 3. (a + b)*(c + d)?\nUsing equation 2 we can rewrite equation 3 as:\n(a + b)*c +\ + \ (a + b)*d The answer is (D).\n\nQ: The Singleton design pattern is used to guarantee\ + \ that only a single instance of a class may be instantiated. Which of the following\ + \ is (are) true of this design pattern?\nI. The Singleton class has a static factory\ + \ method to provide its instance.\nII. The Singleton class can be a subclass of\ + \ another class.\nIII. The Singleton class has a private constructor.\n(A) I only\n\ + (B) II only\n(C) III only\n(D) I, II, and III\nA: Let's think step by step. Statement\ + \ I is a correct statement about a Singleton, because a Singleton restricts instantiation\ + \ to a single, static method. 
Statement II is also correct, because there is no\ + \ inherent restriction regarding the inheritance of a Singleton. Statement III is\ + \ also correct, because a Singleton must be instantiated only once, so its constructor\ + \ is made private to prevent any construction except via its static factory method.\n\ + Given these facts, statements I, II, and III are all correct. The answer is (D).\n\ + \nQ: A certain pipelined RISC machine has 8 general-purpose registers R0, R1, .\ + \ . . , R7 and supports the following operations:\nADD Rs1, Rs2, Rd (Add Rs1 to\ + \ Rs2 and put the sum in Rd)\nMUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the\ + \ product in Rd)\nAn operation normally takes one cycle; however, an operation takes\ + \ two cycles if it produces a result required by the immediately following operation\ + \ in an operation sequence.\nConsider the expression AB + ABC + BC, where variables\ + \ A, B, C are located in registers R0, R1, R2. If the contents of these three registers\ + \ must not be modified, what is the minimum number of clock cycles required for\ + \ an operation sequence that computes the value of AB + ABC + BC?\n(A) 5 (B) 6 (C)\ + \ 7 (D) 8\nA: Let's think step by step. First, we are given that A is in R0, B is\ + \ in R1, and C is in R2.\nNext, we can see that we must compute three multiplies\ + \ (AB, BC, and ABC) and two adds (AB + ABC, (AB + ABC) + BC) to compute our final\ + \ answer, resulting in a minimum of five clock cycles.\nNext, we can see that there\ + \ is no way to avoid at least one pipeline stall when computing our final answer,\ + \ because to compute our final sum we must wait at least one cycle for the results\ + \ from the previous stage to be ready. Thus, our minimum number of cycles must be\ + \ 6.\nWe can verify that we can create a solution that requires only six cycles\ + \ as follows:\ncompute AB: MUL R0, R1, R3\ncompute BC: MUL R1, R2, R4\ncompute ABC:\ + \ MUL R3, R4, R5\ncompute AB + BC: ADD R3, R4, R6\nSTALL\ncompute AB + ABC + BC:\ + \ ADD R5, R6, R7\nSo there are 6 cycles. The answer is (B).\n\nQ: A compiler generates\ + \ code for the following assignment statement.\nG := (A + B) * C - (D + E) * F\n\ + The target machine has a single accumulator and a single-address instruction set\ + \ consisting of instructions load, store, add, subtract, and multiply. For the arithmetic\ + \ operations, the left operand is taken from the accumulator and the result appears\ + \ in the accumulator. The smallest possible number of instructions in the resulting\ + \ code is\n(A) 5 (B) 6 (C) 7 (D) 9\nA: Let's think step by step. We can compute\ + \ the final answer with the following sequence of operations:\n1. LOAD D (accumulator\ + \ = D)\n2. ADD E (accumulator = D+E)\n3. MUL F (accumulator = (D+E)*F)\n4. STORE\ + \ X (X = (D+E)*F)\n5. LOAD A (accumulator = A)\n6. ADD B (accumulator = A+B)\n\ + 7. MUL C (accumulator = (A+B)*C)\n8. SUB X (accumulator = (A+B)*C - (D+E)*F)\n\ + 9. STORE G (G = (A+B)*C - (D+E)*F)\nThis sequence takes 9 instructions. The answer\ + \ is (D).\n\nQ: Consider a computer design in which multiple processors, each with\ + \ a private cache memory, share global memory using a single bus. This bus is the\ + \ critical system resource. Each processor can execute one instruction every 500\ + \ nanoseconds as long as memory references are satisfied by its local cache. 
When\ + \ a cache miss occurs, the processor is delayed for an additional 2,000 nanoseconds.\ + \ During half of this additional delay, the bus is dedicated to serving the cache\ + \ miss. During the other half, the processor cannot continue, but the bus is free\ + \ to service requests from other processors. On average, each instruction requires\ + \ 2 memory references. On average, cache misses occur on 1 percent of references.\ + \ What proportion of the capacity of the bus would a single processor consume, ignoring\ + \ delays due to competition from other processors?\n(A) 1/50 (B) 1/27 (C) 1/25 (D)\ + \ 2/27\nA: Let's think step by step. We know that each instruction requires two\ + \ memory references per instruction, and that there is an average cache miss rate\ + \ of one percent.\nThus a given processor has:\n(1 cache miss / 100 references)\ + \ * (2 references / instruction) =\n(2 cache misses / 100 instructions), so:\nmisses_per_instruction\ + \ = 1 cache miss / 50 instructions.\nNext, we know that each instruction requires\ + \ 500 nanoseconds when there is no cache miss, and 500 + 2000 = 2500 nanoseconds\ + \ when there is a cache miss. Thus:\n50 instructions / (49 * 500) + (1 * 2500) nanoseconds,\ + \ so:\ninstructions_per_ns = 50 instructions / 27000 nanoseconds.\nNow, we know\ + \ that each cache miss locks the bus for half of the 2000 nanosecond cache miss\ + \ delay, or 1000 nanoseconds, so:\nlock_ns_per_miss = 1000 nanoseconds / cache miss.\n\ + Thus we can see that on average a single processor will lock the bus for:\nlock_ns_per_miss\ + \ * misses_per_instruction * instructions_per_ns =\n(1000 nanoseconds / cache miss)\ + \ * (1 cache miss / 50 instructions) * (50 instructions / 27000 nanoseconds) = 1000\ + \ * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml index 4442f9ed12e941c7ae0e4bd6ec7f9622c3084555..9d5d975eb3ca8bad6ed8ef52ee52469e829d1b20 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml @@ -1,49 +1,50 @@ -dataset_name: college_mathematics -description: "The following are multiple choice questions (with answers) about college\ +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college\ \ mathematics.\n\nQ: Let V be the set of all real polynomials p(x). Let transformations\ \ T, S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p'(x) = d/dx p(x), and interpret\ \ (ST)(p(x)) as S(T(p(x))). Which of the following is true?\n(A) ST = 0 (B) ST =\ \ T (C) ST = TS (D) ST - TS is the identity map of V onto itself.\nA: Let's think\ - \ step by step. For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))\u2019 = p(x)\ - \ + xp\u2019(x)\\]\nand\n\\[TS(p) = xp\u2019(x).\\]\nHence \\[ST(p) - TS(p) = p(x)\ - \ + xp\u2019(x) - xp\u2019(x).\\] The answer is (D).\n\nQ: Suppose that f(1 + x)\ - \ = f(x) for all real x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11\ - \ (B) 0 (C) 11 (D) 33/2\nA: Let's think step by step. The only polynomial so that\ - \ $f(1 + x) = f(x)$ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. The answer\ - \ is (C).\n\nQ: Let A be a real 2x2 matrix. 
Which of the following statements must\ - \ be true?\nI. All of the entries of A^2 are nonnegative.\nII. The determinant of\ - \ A^2 is nonnegative.\nIII. If A has two distinct eigenvalues, then A^2 has two\ - \ distinct eigenvalues.\n(A) I only (B) II only (C) III only (D) II and III only\n\ - A: Let's think step by step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence\ - \ II holds.\nIII is false: as a counterexample take a diagonal matrix with -1 and\ - \ 1 on the diagonal. Then $A^2$ is the identity matrix. The answer is (B).\n\nQ:\ - \ Let A be the set of all ordered pairs of integers (m, n) such that 7m + 12n =\ - \ 22. What is the greatest negative number in the set B = {m + n : (m, n) \\in A}?\n\ - (A) -5 (B) -4 (C) -3 (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m\ - \ and one of the solutions is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need\ - \ to look for smaller $m$ in order to make $m + n$ negative. The next solution is\ - \ $m = -14$ and $n = 10$. For smaller $m$ we have $m + n$ smaller than $-4$. The\ - \ answer is (B).\n\nQ: A tank initially contains a salt solution of 3 grams of salt\ - \ dissolved in 100 liters of water. A salt solution containing 0.02 grams of salt\ - \ per liter of water is sprayed into the tank at a rate of 4 liters per minute.\ - \ The sprayed solution is continually mixed with the salt solution in the tank,\ - \ and the mixture flows out of the tank at a rate of 4 liters per minute. If the\ - \ mixing is instantaneous, how many grams of salt are in the tank after 100 minutes\ - \ have elapsed?\n(A) 2 (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step\ - \ by step. For all $t \\in \\mathbb{R}$, let $s(t)$ denote the number grams of salt\ - \ in the tank at the $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably.\ - \ We also use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. The solution sprayed\ - \ into the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always\ - \ 100 liters of liquid in the tank, containing $s$ grams of salt. So the density\ - \ of salt in the tank is $s / 100$ grams per liter. The flow of water out of the\ - \ tank therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for\ - \ all $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$,\ - \ and so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\right]$. For all $t \\in\ - \ \\mathbb{R}$,\n$$\n\frac{d}{d t}[\\ln (s-2)]=\frac{s^{\\prime}}{s-2}=\frac{-1}{25}=\f\ - rac{d}{d t}\\left[-\frac{t}{25}\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that,\ - \ for all $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then,\ - \ for all $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K\ - \ e^{-t / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100\ - \ / 25}=2+1 \\cdot e^{-4}=2+e^{-4}$. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_mathematics + \ step by step. For a given polynomial $p$ we have\n\\[ST(p) = (xp(x))’ = p(x) +\ + \ xp’(x)\\]\nand\n\\[TS(p) = xp’(x).\\]\nHence \\[ST(p) - TS(p) = p(x) + xp’(x)\ + \ - xp’(x).\\] The answer is (D).\n\nQ: Suppose that f(1 + x) = f(x) for all real\ + \ x. If f is a polynomial and f(5) = 11, then f(15/2)\n(A) -11 (B) 0 (C) 11 (D)\ + \ 33/2\nA: Let's think step by step. The only polynomial so that $f(1 + x) = f(x)$\ + \ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. 
The answer is (C).\n\n\ + Q: Let A be a real 2x2 matrix. Which of the following statements must be true?\n\ + I. All of the entries of A^2 are nonnegative.\nII. The determinant of A^2 is nonnegative.\n\ + III. If A has two distinct eigenvalues, then A^2 has two distinct eigenvalues.\n\ + (A) I only (B) II only (C) III only (D) II and III only\nA: Let's think step by\ + \ step. We have \\[ det(A^2) = (det(A))^2 \\geq 0,\\] hence II holds.\nIII is false:\ + \ as a counterexample take a diagonal matrix with -1 and 1 on the diagonal. Then\ + \ $A^2$ is the identity matrix. The answer is (B).\n\nQ: Let A be the set of all\ + \ ordered pairs of integers (m, n) such that 7m + 12n = 22. What is the greatest\ + \ negative number in the set B = {m + n : (m, n) \\in A}?\n(A) -5 (B) -4 (C) -3\ + \ (D) -2\nA: Let's think step by step. We have 12n = 22 - 7m and one of the solutions\ + \ is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need to look for smaller $m$\ + \ in order to make $m + n$ negative. The next solution is $m = -14$ and $n = 10$.\ + \ For smaller $m$ we have $m + n$ smaller than $-4$. The answer is (B).\n\nQ: A\ + \ tank initially contains a salt solution of 3 grams of salt dissolved in 100 liters\ + \ of water. A salt solution containing 0.02 grams of salt per liter of water is\ + \ sprayed into the tank at a rate of 4 liters per minute. The sprayed solution is\ + \ continually mixed with the salt solution in the tank, and the mixture flows out\ + \ of the tank at a rate of 4 liters per minute. If the mixing is instantaneous,\ + \ how many grams of salt are in the tank after 100 minutes have elapsed?\n(A) 2\ + \ (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4\nA: Let's think step by step. For all $t\ + \ \\in \\mathbb{R}$, let $s(t)$ denote the number grams of salt in the tank at the\ + \ $t$ minute mark. Then $s(0) = 3$.\nWe use $s$ and $s(t)$ interchangeably. We also\ + \ use $s^{\\prime}$ and $s^{\\prime}(t)$ interchangeably. The solution sprayed into\ + \ the tank adds $(0.02) 4=2 / 25$ grams of salt per minute. There are always 100\ + \ liters of liquid in the tank, containing $s$ grams of salt. So the density of\ + \ salt in the tank is $s / 100$ grams per liter. The flow of water out of the tank\ + \ therefore subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for all\ + \ $t \\in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$, and\ + \ so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\right]$. For all $t \\in \\mathbb{R}$,\n\ + $$\n\frac{d}{d t}[\\ln (s-2)]=\frac{s^{\\prime}}{s-2}=\frac{-1}{25}=\frac{d}{d t}\\\ + left[-\frac{t}{25}\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such that, for all\ + \ $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$. Then, for all\ + \ $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and so $s(t)=2+K e^{-t\ + \ / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K e^{-100 / 25}=2+1\ + \ \\cdot e^{-4}=2+e^{-4}$. The answer is (D)." 
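As a quick sanity check on the mixing-tank walkthrough that closes the description above: the closed form s(t) = 2 + K e^(-t/25) with s(0) = 3 gives s(100) = 2 + e^(-4). The short Python sketch below is illustrative only — it is not part of this task file or of the harness — and integrates s'(t) = (2 - s)/25 with forward Euler to confirm the value; the step size `dt` and the function name `salt_ode_euler` are arbitrary choices.

```python
import math

def salt_ode_euler(s0: float = 3.0, t_end: float = 100.0, dt: float = 1e-3) -> float:
    """Forward-Euler integration of s'(t) = (2 - s) / 25."""
    s = s0
    for _ in range(int(t_end / dt)):
        s += dt * (2.0 - s) / 25.0
    return s

closed_form = 2.0 + math.exp(-4.0)      # s(100) = 2 + e^(-100/25)
numeric = salt_ode_euler()
assert abs(numeric - closed_form) < 1e-3
print(f"s(100): numeric {numeric:.5f}, closed form {closed_form:.5f}")
```

Both values come out near 2.018 g, matching choice (D) in the prompt.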
+"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml index 8f3ae14e1c12c3f6d00ebe87e17c227641dba9db..68c7f4343cb2e49b05137defc9aaa69bc20b93c9 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml @@ -1,5 +1,5 @@ -dataset_name: college_medicine -description: "The following are multiple choice questions (with answers) about college\ +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college\ \ medicine.\n\nQ: An expected side effect of creatine supplementation is:\n(A) muscle\ \ weakness. (B) gain in body mass. (C) muscle cramps. (D) loss of electrolytes.\n\ A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\ @@ -9,44 +9,44 @@ description: "The following are multiple choice questions (with answers) about c \ endurance runners have a high proportion of Type I fibres in their leg muscles\ \ (C) Liver glycogen is important in the maintenance of the blood glucose concentration\ \ (D) Insulin promotes glucose uptake by all tissues in the body\nA: Let's think\ - \ step by step. We refer to Wikipedia articles on medicine for help. Let\u2019s\ - \ solve this step by step and go over each choice: \n(A) \u201CMuscle glycogen is\ - \ broken down enzymatically to glucose-1-phosphate\u201D: This is a correct statement.\n\ - (B) \u201CElite endurance runners have a high proportion of Type I fibres in their\ - \ leg muscles\u201D: This is a correct statement.\n(C) \u201CLiver glycogen is important\ - \ in the maintenance of the blood glucose concentration\u201D: This is a correct\ - \ statement. \n(D) \u201CInsulin promotes glucose uptake by all tissues in the body\u201D\ - : This is not a correct statement, because insulin promotes glucose uptake by the\ - \ liver, adipose tissue, and muscle, but not all tissues. For instance, the tissues\ - \ in the brain and red blood cells are not affected by insulin. The answer is (D).\n\ - \nQ: A high school science teacher fills a 1 liter bottle with pure nitrogen and\ - \ seals the lid. The pressure is 1.70 atm, and the room temperature is 25\xB0C.\ - \ Which two variables will both increase the pressure of the system, if all other\ - \ variables are held constant?\n(A) Increasing temperature, increasing moles of\ - \ gas (B) Increasing temperature, increasing volume (C) Decreasing volume, decreasing\ - \ temperature (D) Decreasing moles of gas, increasing volume\nA: Let's think step\ - \ by step. We refer to Wikipedia articles on medicine for help. The relevant equation\ - \ for this is the ideal gas law: PV=nRT. To increase the pressure of the system\ - \ (P), then either n (number of moles of the gas) or T (temperature) have to increase.\ - \ The answer is (A).\n\nQ: In a genetic test of a newborn, a rare genetic disorder\ - \ is found that has X-linked recessive transmission. Which of the following statements\ - \ is likely true regarding the pedigree of this disorder?\n(A) All descendants on\ - \ the maternal side will have the disorder. (B) Females will be approximately twice\ - \ as affected as males in this family. (C) All daughters of an affected male will\ - \ be affected. 
(D) There will be equal distribution of males and females affected.\n\ - A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\ - \ Let\u2019s solve this step by step. Let's recall first that females have two X\ - \ chromosomes, while males have one X and one Y chromosome. This is an important\ - \ fact we need to know before answering this question. \nBecause a male can only\ - \ pass his only one X chromosome to a daughter, if he is affected by this rare genetic\ - \ disorder, then we know for sure that he will pass this rare genetic disorder to\ - \ all his future-born daughters. Therefore, \u201C(C): All daughters of an affected\ - \ male will be affected\u201D is a correct statement. The answer is (C).\n\nQ: Glucose\ - \ is transported into the muscle cell:\n(A) via protein transporters called GLUT4.\ - \ (B) only in the presence of insulin. (C) via hexokinase. (D) via monocarbylic\ - \ acid transporters.\nA: Let's think step by step. We refer to Wikipedia articles\ - \ on medicine for help. Glucose (also known as the blood sugar) is the main sugar\ - \ found in the human body. It is transported into the muscle cell via diffusion\ - \ through protein transporters called GLUT4. The answer is (A)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_medicine + \ step by step. We refer to Wikipedia articles on medicine for help. Let’s solve\ + \ this step by step and go over each choice: \n(A) “Muscle glycogen is broken down\ + \ enzymatically to glucose-1-phosphate”: This is a correct statement.\n(B) “Elite\ + \ endurance runners have a high proportion of Type I fibres in their leg muscles”:\ + \ This is a correct statement.\n(C) “Liver glycogen is important in the maintenance\ + \ of the blood glucose concentration”: This is a correct statement. \n(D) “Insulin\ + \ promotes glucose uptake by all tissues in the body”: This is not a correct statement,\ + \ because insulin promotes glucose uptake by the liver, adipose tissue, and muscle,\ + \ but not all tissues. For instance, the tissues in the brain and red blood cells\ + \ are not affected by insulin. The answer is (D).\n\nQ: A high school science teacher\ + \ fills a 1 liter bottle with pure nitrogen and seals the lid. The pressure is 1.70\ + \ atm, and the room temperature is 25°C. Which two variables will both increase\ + \ the pressure of the system, if all other variables are held constant?\n(A) Increasing\ + \ temperature, increasing moles of gas (B) Increasing temperature, increasing volume\ + \ (C) Decreasing volume, decreasing temperature (D) Decreasing moles of gas, increasing\ + \ volume\nA: Let's think step by step. We refer to Wikipedia articles on medicine\ + \ for help. The relevant equation for this is the ideal gas law: PV=nRT. To increase\ + \ the pressure of the system (P), then either n (number of moles of the gas) or\ + \ T (temperature) have to increase. The answer is (A).\n\nQ: In a genetic test of\ + \ a newborn, a rare genetic disorder is found that has X-linked recessive transmission.\ + \ Which of the following statements is likely true regarding the pedigree of this\ + \ disorder?\n(A) All descendants on the maternal side will have the disorder. (B)\ + \ Females will be approximately twice as affected as males in this family. (C) All\ + \ daughters of an affected male will be affected. (D) There will be equal distribution\ + \ of males and females affected.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on medicine for help. 
Let’s solve this step by step. Let's recall first\ + \ that females have two X chromosomes, while males have one X and one Y chromosome.\ + \ This is an important fact we need to know before answering this question. \nBecause\ + \ a male can only pass his only one X chromosome to a daughter, if he is affected\ + \ by this rare genetic disorder, then we know for sure that he will pass this rare\ + \ genetic disorder to all his future-born daughters. Therefore, “(C): All daughters\ + \ of an affected male will be affected” is a correct statement. The answer is (C).\n\ + \nQ: Glucose is transported into the muscle cell:\n(A) via protein transporters\ + \ called GLUT4. (B) only in the presence of insulin. (C) via hexokinase. (D) via\ + \ monocarbylic acid transporters.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on medicine for help. Glucose (also known as the blood sugar) is the\ + \ main sugar found in the human body. It is transported into the muscle cell via\ + \ diffusion through protein transporters called GLUT4. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml index d500a5b8a3f518befa5f53d6e324227f65079f15..f4135204f079639531f002bb257197078d2db52c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml @@ -1,70 +1,44 @@ -dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college - physics. - - - Q: A refracting telescope consists of two converging lenses separated by 100 cm. - The eye-piece lens has a focal length of 20 cm. The angular magnification of the - telescope is - - (A) 4 (B) 5 (C) 6 (D) 20 - - A: Let''s think step by step. In a refracting telescope, if both lenses are converging, - the focus of both lenses must be between the two lenses, and thus the focal lengths - of the two lenses must add up to their separation. Since the focal length of one - lens is 20 cm, the focal length of the other must be 80 cm. The magnification is - the ratio of these two focal lengths, or 4. The answer is (A). - - - Q: The muon decays with a characteristic lifetime of about 10^-6 second into an - electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from - decaying into an electron and just a single neutrino by the law of conservation - of - - (A) charge (B) mass (C) energy and momentum (D) lepton number - - A: Let''s think step by step. Lepton number must be conserved, meaning the total - number of leptons minus the number of antileptons. If a muon decays into an electron - and a single neutrino, the total lepton number would go from one to two, violating - lepton number conservation. The answer is (D). - - - Q: One end of a Nichrome wire of length 2L and cross-sectional area A is attached - to an end of another Nichrome wire of length L and cross- sectional area 2A. If - the free end of the longer wire is at an electric potential of 8.0 volts, and the - free end of the shorter wire is at an electric potential of 1.0 volt, the potential - at the junction of the two wires is most nearly equal to - - (A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V - - A: Let''s think step by step. 
This is a simple voltage divider problem, where the - longer wire has a resistance four times that of the shorter end. So the voltage - divider ratio is 1 / 5, meaning that the potential in the middle is 1.0 V + (8.0 - V - 1.0 V) * 1/5 = 2.4 V. The answer is (A). - - - Q: A refracting telescope consists of two converging lenses separated by 100 cm. - The eye-piece lens has a focal length of 20 cm. The angular magnification of the - telescope is - - (A) 4 (B) 5 (C) 6 (D) 20 - - A: Let''s think step by step. In a refracting telescope, if both lenses are converging, - the focus of both lenses must be between the two lenses, and thus the focal lengths - of the two lenses must add up to their separation. Since the focal length of one - lens is 20 cm, the focal length of the other must be 80 cm. The magnification is - the ratio of these two focal lengths, or 4. The answer is (A). - - - Q: For which of the following thermodynamic processes is the increase in the internal - energy of an ideal gas equal to the heat added to the gas? - - (A) Constant temperature (B) Constant volume (C) Constant pressure (D) Adiabatic - - A: Let''s think step by step. Heat added to the gas can go into the gases internal - energy or work done against an external force. However, if the volume of the gas - container is constant, no work will be done (since work is pressure times change - in volume). So, at constant volume, all of the heat goes into the internal energy. - The answer is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_college_physics +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\nQ: A refracting telescope consists of two converging lenses separated\ + \ by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification\ + \ of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In\ + \ a refracting telescope, if both lenses are converging, the focus of both lenses\ + \ must be between the two lenses, and thus the focal lengths of the two lenses must\ + \ add up to their separation. Since the focal length of one lens is 20 cm, the focal\ + \ length of the other must be 80 cm. The magnification is the ratio of these two\ + \ focal lengths, or 4. The answer is (A).\n\nQ: The muon decays with a characteristic\ + \ lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron\ + \ antineutrino. The muon is forbidden from decaying into an electron and just a\ + \ single neutrino by the law of conservation of\n(A) charge (B) mass (C) energy\ + \ and momentum (D) lepton number\nA: Let's think step by step. Lepton number must\ + \ be conserved, meaning the total number of leptons minus the number of antileptons.\ + \ If a muon decays into an electron and a single neutrino, the total lepton number\ + \ would go from one to two, violating lepton number conservation. The answer is\ + \ (D).\n\nQ: One end of a Nichrome wire of length 2L and cross-sectional area A\ + \ is attached to an end of another Nichrome wire of length L and cross- sectional\ + \ area 2A. If the free end of the longer wire is at an electric potential of 8.0\ + \ volts, and the free end of the shorter wire is at an electric potential of 1.0\ + \ volt, the potential at the junction of the two wires is most nearly equal to\n\ + (A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V\nA: Let's think step by step. 
This is a\ + \ simple voltage divider problem, where the longer wire has a resistance four times\ + \ that of the shorter end. So the voltage divider ratio is 1 / 5, meaning that the\ + \ potential in the middle is 1.0 V + (8.0 V - 1.0 V) * 1/5 = 2.4 V. The answer is\ + \ (A).\n\nQ: A refracting telescope consists of two converging lenses separated\ + \ by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification\ + \ of the telescope is\n(A) 4 (B) 5 (C) 6 (D) 20\nA: Let's think step by step. In\ + \ a refracting telescope, if both lenses are converging, the focus of both lenses\ + \ must be between the two lenses, and thus the focal lengths of the two lenses must\ + \ add up to their separation. Since the focal length of one lens is 20 cm, the focal\ + \ length of the other must be 80 cm. The magnification is the ratio of these two\ + \ focal lengths, or 4. The answer is (A).\n\nQ: For which of the following thermodynamic\ + \ processes is the increase in the internal energy of an ideal gas equal to the\ + \ heat added to the gas?\n(A) Constant temperature (B) Constant volume (C) Constant\ + \ pressure (D) Adiabatic\nA: Let's think step by step. Heat added to the gas can\ + \ go into the gases internal energy or work done against an external force. However,\ + \ if the volume of the gas container is constant, no work will be done (since work\ + \ is pressure times change in volume). So, at constant volume, all of the heat goes\ + \ into the internal energy. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml index 08f23169d38b45b0143ec5467a86cf18e9fa5771..11d2736adf789feb1b6ecb614426f2dce099455d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml @@ -1,35 +1,36 @@ -dataset_name: computer_security -description: "The following are multiple choice questions (with answers) about computer\ +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer\ \ security.\n\nQ: SHA-1 has a message digest of\n(A) 160 bits (B) 512 bits (C) 628\ \ bits (D) 820 bits\nA: Let's think step by step. Since SHA-1 is a hash function\ \ which takes an input and produces a 160-bit (20-byte) hash value, its message\ \ digest is 160 bits. The answer is (A).\n\nQ: _____________ can modify data on\ - \ your system \u2013 so that your system doesn\u2019t run correctly or you can no\ - \ longer access specific data, or it may even ask for ransom in order to give your\ - \ access.\n(A) IM \u2013 Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D)\ - \ Ransom Trojan\nA: Let's think step by step. The system is asking for trojans,\ - \ which are for ransom, which means ransom trojan. The answer is (D).\n\nQ: What\ - \ is ethical hacking?\n(A) \"Hacking\" ethics so they justify unintended selfish\ - \ behavior (B) Hacking systems (e.g., during penetration testing) to expose vulnerabilities\ - \ so they can be fixed, rather than exploited (C) Hacking into systems run by those\ - \ whose ethics you disagree with (D) A slang term for rapid software development,\ - \ e.g., as part of hackathons\nA: Let's think step by step. 
Ethical hacking is a\ - \ process of detecting vulnerabilities in an application, system, or organization's\ - \ infrastructure that an attacker can use to exploit an individual or organization.\ - \ They use this process to prevent cyberattacks and security breaches by lawfully\ - \ hacking into the systems and looking for weak points. The answer is (B).\n\nQ:\ - \ The ____________ is anything which your search engine cannot search.\n(A) Haunted\ - \ web (B) World Wide Web (C) Surface web (D) Deep Web\nA: Let's think step by step.\ - \ The search engine searches on the Surface Web, which is the portion of the world\ - \ wide web which is visible so (B,C) are wrong. The Haunted Web doesn\u2019t correspond\ - \ to an internet concept. The Deep Web is the part of the World Wide Web which is\ - \ not indexed. The answer is (D).\n\nQ: Exploitation of the Heartbleed bug permits\n\ - (A) overwriting cryptographic keys in memory (B) a kind of code injection (C) a\ - \ read outside bounds of a buffer (D) a format string attack\nA: Let's think step\ - \ by step. The Heartbleed Bug is a serious vulnerability in the popular OpenSSL\ - \ cryptographic software library. Heartbleed resulted from improper input validation\ - \ (due to a missing bounds check) in the implementation of the TLS heartbeat extension.\ - \ The vulnerability was classified as a buffer over-read, a situation where more\ - \ data can be read than should be allowed. The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_computer_security + \ your system – so that your system doesn’t run correctly or you can no longer access\ + \ specific data, or it may even ask for ransom in order to give your access.\n(A)\ + \ IM – Trojans (B) Backdoor Trojans (C) Trojan-Downloader (D) Ransom Trojan\nA:\ + \ Let's think step by step. The system is asking for trojans, which are for ransom,\ + \ which means ransom trojan. The answer is (D).\n\nQ: What is ethical hacking?\n\ + (A) \"Hacking\" ethics so they justify unintended selfish behavior (B) Hacking systems\ + \ (e.g., during penetration testing) to expose vulnerabilities so they can be fixed,\ + \ rather than exploited (C) Hacking into systems run by those whose ethics you disagree\ + \ with (D) A slang term for rapid software development, e.g., as part of hackathons\n\ + A: Let's think step by step. Ethical hacking is a process of detecting vulnerabilities\ + \ in an application, system, or organization's infrastructure that an attacker can\ + \ use to exploit an individual or organization. They use this process to prevent\ + \ cyberattacks and security breaches by lawfully hacking into the systems and looking\ + \ for weak points. The answer is (B).\n\nQ: The ____________ is anything which your\ + \ search engine cannot search.\n(A) Haunted web (B) World Wide Web (C) Surface web\ + \ (D) Deep Web\nA: Let's think step by step. The search engine searches on the Surface\ + \ Web, which is the portion of the world wide web which is visible so (B,C) are\ + \ wrong. The Haunted Web doesn’t correspond to an internet concept. The Deep Web\ + \ is the part of the World Wide Web which is not indexed. The answer is (D).\n\n\ + Q: Exploitation of the Heartbleed bug permits\n(A) overwriting cryptographic keys\ + \ in memory (B) a kind of code injection (C) a read outside bounds of a buffer (D)\ + \ a format string attack\nA: Let's think step by step. The Heartbleed Bug is a serious\ + \ vulnerability in the popular OpenSSL cryptographic software library. 
Heartbleed\ + \ resulted from improper input validation (due to a missing bounds check) in the\ + \ implementation of the TLS heartbeat extension. The vulnerability was classified\ + \ as a buffer over-read, a situation where more data can be read than should be\ + \ allowed. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml index df845ce8b2821fd05d04b356214308c558618405..4866041d1e38a17958fda14c6ebdbe6223ad1105 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml @@ -1,32 +1,33 @@ -dataset_name: conceptual_physics -description: "\nThe following are multiple choice questions (with answers) about conceptual\ - \ physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted to a different\ - \ frequency (B) deflection (C) interference (D) polarization\nA: Let's think step\ - \ by step. In a soap bubble film, the light bounces between the two soap-air interfaces\ - \ many times, interfering with itself constructively or destructively depending\ - \ on the width of the film. This results in different colors being visible. The\ - \ answer is (C).\n\nQ: Compared with the mass of a uranium atom undergoing fission,\ - \ the combined masses of the products after fission are\n(A) less (B) more (C) the\ - \ same (D) zero\nA: Let's think step by step. Fission releases energy, which comes\ - \ from the rest mass of its initial nucleus. Thus the mass of the products is less\ - \ than the mass of the reactant uranium nucleus. The answer is (A).\n\nQ: Things\ - \ that are equivalent according to the equivalence principle are\n(A) space and\ - \ time. (B) a traveling twin and a stay-at-home twin. (C) gravity and acceleration.\ - \ (D) mass and energy.\nA: Let's think step by step. Einstein\u2019s famous equivalence\ - \ principle states that gravity and acceleration are equivalent. The answer is (C).\n\ - \nQ: Which of these three elements has the most mass per nucleon?\n(A) Hydrogen\ - \ (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step. Due to nuclear\ - \ binding energy, the mass of an atomic nucleus is less than the sum of individual\ - \ masses of the free constituent protons and neutrons; this is known as the mass\ - \ defect. Hydrogen has no mass defect because it has only a single nucleon, so it\ - \ will have the most mass per nucleon. The answer is (A).\n\nQ: A model airplane\ - \ flies slower when flying into the wind and faster with wind at its back. When\ - \ launched at right angles to the wind a cross wind its groundspeed compared with\ - \ flying in still air is\n(A) the same (B) greater (C) less (D) either greater or\ - \ less depending on wind speed\nA: Let's think step by step. The plane\u2019s speed\ - \ in the direction of the wind is greater than it would be in the absence of wind,\ - \ and its direction orthogonal to the wind is the same as it would be in the absence\ - \ of the wind. The total speed, which is these two components added in quadrature,\ - \ is thus greater than the speed in still air. The answer is (B)." 
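The crosswind question at the end of the conceptual-physics prompt above rests on one numeric fact: a velocity component added at right angles increases total speed, because the components add in quadrature. A minimal sketch follows, with made-up speeds — the values and names are illustrative, not taken from the task data.

```python
import math

def groundspeed(v_still_air: float, v_crosswind: float) -> float:
    """Speed over the ground when a crosswind acts at right angles."""
    return math.hypot(v_still_air, v_crosswind)  # sqrt(v^2 + w^2)

v_air, v_wind = 10.0, 3.0                 # illustrative speeds in m/s
v_ground = groundspeed(v_air, v_wind)
assert v_ground > v_air                   # exceeds still-air speed whenever wind is nonzero
print(f"groundspeed {v_ground:.2f} m/s vs still-air {v_air:.2f} m/s")
```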
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_conceptual_physics +"dataset_name": "conceptual_physics" +"description": "\nThe following are multiple choice questions (with answers) about\ + \ conceptual physics.\n\nQ: Colors in a soap bubble result from light\n(A) converted\ + \ to a different frequency (B) deflection (C) interference (D) polarization\nA:\ + \ Let's think step by step. In a soap bubble film, the light bounces between the\ + \ two soap-air interfaces many times, interfering with itself constructively or\ + \ destructively depending on the width of the film. This results in different colors\ + \ being visible. The answer is (C).\n\nQ: Compared with the mass of a uranium atom\ + \ undergoing fission, the combined masses of the products after fission are\n(A)\ + \ less (B) more (C) the same (D) zero\nA: Let's think step by step. Fission releases\ + \ energy, which comes from the rest mass of its initial nucleus. Thus the mass of\ + \ the products is less than the mass of the reactant uranium nucleus. The answer\ + \ is (A).\n\nQ: Things that are equivalent according to the equivalence principle\ + \ are\n(A) space and time. (B) a traveling twin and a stay-at-home twin. (C) gravity\ + \ and acceleration. (D) mass and energy.\nA: Let's think step by step. Einstein’s\ + \ famous equivalence principle states that gravity and acceleration are equivalent.\ + \ The answer is (C).\n\nQ: Which of these three elements has the most mass per nucleon?\n\ + (A) Hydrogen (B) Iron (C) Uranium (D) Same in each\nA: Let's think step by step.\ + \ Due to nuclear binding energy, the mass of an atomic nucleus is less than the\ + \ sum of individual masses of the free constituent protons and neutrons; this is\ + \ known as the mass defect. Hydrogen has no mass defect because it has only a single\ + \ nucleon, so it will have the most mass per nucleon. The answer is (A).\n\nQ: A\ + \ model airplane flies slower when flying into the wind and faster with wind at\ + \ its back. When launched at right angles to the wind a cross wind its groundspeed\ + \ compared with flying in still air is\n(A) the same (B) greater (C) less (D) either\ + \ greater or less depending on wind speed\nA: Let's think step by step. The plane’s\ + \ speed in the direction of the wind is greater than it would be in the absence\ + \ of wind, and its direction orthogonal to the wind is the same as it would be in\ + \ the absence of the wind. The total speed, which is these two components added\ + \ in quadrature, is thus greater than the speed in still air. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml index 33883f474c12ec1a8d641cf8f7489284ab65ed18..c97ae1b25cac9d98c7084fad0ab96fb234252c23 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml @@ -1,63 +1,63 @@ -dataset_name: econometrics -description: "The following are multiple choice questions (with answers) about econometrics.\n\ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ \nQ: Suppose now that a researcher wishes to use information criteria to determine\ \ the optimal lag length for a VAR. 
500 observations are available for the bi-variate\ \ VAR, and the values of the determinant of the variance-covariance matrix of residuals\ \ are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively. What\ \ is the optimal model order according to Akaike's information criterion?\n(A) 1\ \ lag (B) 2 lags (C) 3 lags (D) 4 lags\nA: Let's think step by step. We refer to\ - \ Wikipedia articles on econometrics for help. Let\u2019s solve this problem step\ - \ by step. First of all, let\u2019s recall that for a given set of data, Akaike's\ - \ information criterion (AIC) allows us to measure how well a statistical model\ - \ fits the data; it is an estimator of prediction error. Here in this problem we\ - \ will need to use the formula ln(det(sigma_hat)) + (2 * k / T) to determine the\ - \ values of Akaike\u2019s criterion, where ln denotes the natural log function,\ - \ det the determinant function, k the total number of parameters in total (across\ - \ both equations), and T the number of observations (which, in this case, is equal\ - \ to 500). For 1 lag, the number of parameters in total is equal to 6; for 2 lags,\ - \ it is 10; for 3 lags, it is 14; and for 4 lags, it is 18. Now, let\u2019s calculate\ - \ the values of the criterion for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500)\ - \ = ln(0.0336) + (12 / 500) = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) =\ - \ ln(0.0169) + (20 / 500) = -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084)\ - \ + (28 / 500) =-4.724\n(D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36\ - \ / 500) =-5.011\nBecause the optimal model order according to AIC minimizes the\ - \ information criterion, the answer should be the one with the lowest value. In\ - \ this case, (D) has the lowest value. The answer is (C).\n\nQ: Consider the following\ - \ AR(1) model with the disturbances having zero mean and unit variance\nyt = 0.2\ - \ + 0.4 yt-1 + ut\nThe (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4\ - \ (C) 0.5 (D) 0.33\nA: Let's think step by step. We refer to Wikipedia articles\ - \ on econometrics for help. Let\u2019s solve this problem step by step. If we have\ - \ a an AR(1) model with the disturbances having zero mean and unit variance, then\ - \ the unconditional mean of y is equal to the following:\nunconditional mean of\ - \ y = (the intercept term) / (1 - autoregressive coefficient)\nWe know that the\ - \ intercept term is 0.2 and the autoregressive coefficient is 0.4; thus, we have:\n\ - unconditional mean of y = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which\ - \ is approximately 0.33. That means that the answer should be (D) 0.33. The answer\ - \ is (D).\n\nQ: What would be then consequences for the OLS estimator if heteroscedasticity\ - \ is present in a regression model but ignored?\n(A) It will be biased (B) It will\ - \ be inconsistent (C) It will be inefficient (D) All of (a), (b) and (c) will be\ - \ true.\nA: Let's think step by step. We refer to Wikipedia articles on econometrics\ - \ for help. Heteroscedasticity refers to the condition where the variance of the\ - \ error terms is not constant across multiple observations. If heteroscedasticity\ - \ is present in a regression model, then the coefficient estimates in the OLS estimator\ - \ will be not only unbiased and consistent but also inefficient. Because (A) and\ - \ (B) are incorrect choices and (C) is a correct choice, (D) cannot be the right\ - \ answer. Ultimately, (C) is the only true choice. 
The answer is (C).\n\nQ: Suppose\ - \ that a test statistic has associated with it a p-value of 0.08. Which one of the\ - \ following statements is true?\n(i) If the size of the test were exactly 8%, we\ - \ would be indifferent between rejecting and not rejecting the null hypothesis\n\ - (ii) The null would be rejected if a 10% size of test were used\n(iii) The null\ - \ would not be rejected if a 1% size of test were used\n(iv) The null would be rejected\ - \ if a 5% size of test were used.\n(A) (ii) and (iv) only (B) (i) and (iii) only\ - \ (C) (i), (ii), and (iii) only (D) (i), (ii), (iii), and (iv).\nA: Let's think\ - \ step by step. We refer to Wikipedia articles on econometrics for help. Let\u2019\ - s reason about each of the options.\n(i) is a true statement.\n(ii) is a true statement.\n\ - (iii) is a true statement.\n(iv) is not a true statement. Thus, (i), (ii), and (iii)\ - \ are true. The answer is (C).\n\nQ: For a stationary autoregressive process, shocks\ - \ will\n(A) Eventually die away (B) Persist indefinitely (C) Grow exponentially\ - \ (D) Never occur\nA: Let's think step by step. We refer to Wikipedia articles on\ - \ econometrics for help. This is a formal logic problem about stationally process.\ - \ For a stationary autoregressive process, shocks will eventually die away. The\ - \ answer is (A)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_econometrics + \ Wikipedia articles on econometrics for help. Let’s solve this problem step by\ + \ step. First of all, let’s recall that for a given set of data, Akaike's information\ + \ criterion (AIC) allows us to measure how well a statistical model fits the data;\ + \ it is an estimator of prediction error. Here in this problem we will need to use\ + \ the formula ln(det(sigma_hat)) + (2 * k / T) to determine the values of Akaike’s\ + \ criterion, where ln denotes the natural log function, det the determinant function,\ + \ k the total number of parameters in total (across both equations), and T the number\ + \ of observations (which, in this case, is equal to 500). For 1 lag, the number\ + \ of parameters in total is equal to 6; for 2 lags, it is 10; for 3 lags, it is\ + \ 14; and for 4 lags, it is 18. Now, let’s calculate the values of the criterion\ + \ for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6 / 500) = ln(0.0336) + (12 / 500)\ + \ = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10 / 500) = ln(0.0169) + (20 / 500) =\ + \ -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14 / 500) = ln(0.0084) + (28 / 500) =-4.724\n\ + (D) 4 lags: ln(0.0062) + (2 * 18 / 500) = ln(0.0062) + (36 / 500) =-5.011\nBecause\ + \ the optimal model order according to AIC minimizes the information criterion,\ + \ the answer should be the one with the lowest value. In this case, (D) has the\ + \ lowest value. The answer is (C).\n\nQ: Consider the following AR(1) model with\ + \ the disturbances having zero mean and unit variance\nyt = 0.2 + 0.4 yt-1 + ut\n\ + The (unconditional) mean of y will be given by\n(A) 0.2 (B) 0.4 (C) 0.5 (D) 0.33\n\ + A: Let's think step by step. We refer to Wikipedia articles on econometrics for\ + \ help. Let’s solve this problem step by step. 
If we have a an AR(1) model with\ + \ the disturbances having zero mean and unit variance, then the unconditional mean\ + \ of y is equal to the following:\nunconditional mean of y = (the intercept term)\ + \ / (1 - autoregressive coefficient)\nWe know that the intercept term is 0.2 and\ + \ the autoregressive coefficient is 0.4; thus, we have:\nunconditional mean of y\ + \ = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which is approximately 0.33.\ + \ That means that the answer should be (D) 0.33. The answer is (D).\n\nQ: What would\ + \ be then consequences for the OLS estimator if heteroscedasticity is present in\ + \ a regression model but ignored?\n(A) It will be biased (B) It will be inconsistent\ + \ (C) It will be inefficient (D) All of (a), (b) and (c) will be true.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on econometrics for help. Heteroscedasticity\ + \ refers to the condition where the variance of the error terms is not constant\ + \ across multiple observations. If heteroscedasticity is present in a regression\ + \ model, then the coefficient estimates in the OLS estimator will be not only unbiased\ + \ and consistent but also inefficient. Because (A) and (B) are incorrect choices\ + \ and (C) is a correct choice, (D) cannot be the right answer. Ultimately, (C) is\ + \ the only true choice. The answer is (C).\n\nQ: Suppose that a test statistic has\ + \ associated with it a p-value of 0.08. Which one of the following statements is\ + \ true?\n(i) If the size of the test were exactly 8%, we would be indifferent between\ + \ rejecting and not rejecting the null hypothesis\n(ii) The null would be rejected\ + \ if a 10% size of test were used\n(iii) The null would not be rejected if a 1%\ + \ size of test were used\n(iv) The null would be rejected if a 5% size of test were\ + \ used.\n(A) (ii) and (iv) only (B) (i) and (iii) only (C) (i), (ii), and (iii)\ + \ only (D) (i), (ii), (iii), and (iv).\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on econometrics for help. Let’s reason about each of the options.\n\ + (i) is a true statement.\n(ii) is a true statement.\n(iii) is a true statement.\n\ + (iv) is not a true statement. Thus, (i), (ii), and (iii) are true. The answer is\ + \ (C).\n\nQ: For a stationary autoregressive process, shocks will\n(A) Eventually\ + \ die away (B) Persist indefinitely (C) Grow exponentially (D) Never occur\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on econometrics for help. This\ + \ is a formal logic problem about stationally process. For a stationary autoregressive\ + \ process, shocks will eventually die away. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml index cdd31ce4c13ca07b1c54d8b80e4ee30fef52e1cc..ea7b24a077bed8873a2a6337d68ddf2e1577f188 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml @@ -1,34 +1,34 @@ -dataset_name: electrical_engineering -description: "\nThe following are multiple choice questions (with answers) about electrical\ - \ engineering.\n\nQ: A point pole has a strength of 4\u03C0 * 10^-4 weber. 
The force\ - \ in newtons on a point pole of 4\u03C0 * 1.5 * 10^-4 weber placed at a distance\ +"dataset_name": "electrical_engineering" +"description": "\nThe following are multiple choice questions (with answers) about\ + \ electrical engineering.\n\nQ: A point pole has a strength of 4π * 10^-4 weber.\ + \ The force in newtons on a point pole of 4π * 1.5 * 10^-4 weber placed at a distance\ \ of 10 cm from it will be\n(A) 15 N. (B) 20 N. (C) 7.5 N. (D) 3.75 N.\nA: Let's\ \ think step by step. The force between two point poles is given by m_1m_2/(mu_0\ - \ 4 \\pi r^2), in analogy to Coulomb\u2019s law. Plugging in the values given in\ - \ the question, we calculate that the force is approximately 15 N. The answer is\ - \ (A).\n\nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30\ - \ mm wide. The control torque is 240*10-6 N-m on full scale. If magnetic flux density\ - \ is 1Wb/m2 range of meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's\ - \ think step by step. The torque on a coil in a uniform magnetic field is given\ - \ by BANI, where B is the magnetic flux density, A is the area of the coil, N is\ - \ the number of turns, and I is the current. So we have that I = (Torque)/(BAN),\ - \ or 240e-6/(1200e-6 * 100 * 1) = 2e-3. The answer is (B).\n\nQ: In an SR latch\ - \ built from NOR gates, which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1\ - \ (C) S=1, R=0 (D) S=1, R=1\nA: Let's think step by step. An SR latch is a set-reset\ - \ latch; in the case where S=1 and R=1, the circuit has no stable state; instead\ - \ a race condition will be produced within the circuit, so the device will be in\ - \ an undefined state. So S=1, R=1 is an illegal input. The answer is (D).\n\nQ:\ - \ Two long parallel conductors carry 100 A. If the conductors are separated by 20\ - \ mm, the force per meter of length of each conductor will be\n(A) 100 N. (B) 0.1\ - \ N. (C) 1 N. (D) 0.01 N.\nA: Let's think step by step. The magnetic force-per-length\ - \ between two current-carrying conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r),\ - \ where $r$ is the separation distance and I_1 and I_2 are the currents. Plugging\ - \ in 100 A for I_1 and I_2, and 20 mm for r, gives 0.1 N. The answer is (B).\n\n\ - Q: In a 2 pole lap winding dc machine , the resistance of one conductor is 2\u03A9\ - \ and total number of conductors is 100. Find the total resistance\n(A) 200\u03A9\ - \ (B) 100\u03A9 (C) 50\u03A9 (D) 10\u03A9\nA: Let's think step by step. In lap winding,\ - \ effectively two resistors are connected in parallel, so the actual resistance\ - \ of each pair is 1 Ohm. Since we have 50 pairs, we get a total resistance of 50\ - \ Ohms. The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_electrical_engineering + \ 4 \\pi r^2), in analogy to Coulomb’s law. Plugging in the values given in the\ + \ question, we calculate that the force is approximately 15 N. The answer is (A).\n\ + \nQ: The coil of a moving coil meter has 100 turns, is 40 mm long and 30 mm wide.\ + \ The control torque is 240*10-6 N-m on full scale. If magnetic flux density is\ + \ 1Wb/m2 range of meter is\n(A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.\nA: Let's think\ + \ step by step. The torque on a coil in a uniform magnetic field is given by BANI,\ + \ where B is the magnetic flux density, A is the area of the coil, N is the number\ + \ of turns, and I is the current. So we have that I = (Torque)/(BAN), or 240e-6/(1200e-6\ + \ * 100 * 1) = 2e-3. 
The answer is (B).\n\nQ: In an SR latch built from NOR gates,\ + \ which condition is not allowed\n(A) S=0, R=0 (B) S=0, R=1 (C) S=1, R=0 (D) S=1,\ + \ R=1\nA: Let's think step by step. An SR latch is a set-reset latch; in the case\ + \ where S=1 and R=1, the circuit has no stable state; instead a race condition will\ + \ be produced within the circuit, so the device will be in an undefined state. So\ + \ S=1, R=1 is an illegal input. The answer is (D).\n\nQ: Two long parallel conductors\ + \ carry 100 A. If the conductors are separated by 20 mm, the force per meter of\ + \ length of each conductor will be\n(A) 100 N. (B) 0.1 N. (C) 1 N. (D) 0.01 N.\n\ + A: Let's think step by step. The magnetic force-per-length between two current-carrying\ + \ conductors is given by \\mu_0 I_1 I_2 / (2 \\pi r), where $r$ is the separation\ + \ distance and I_1 and I_2 are the currents. Plugging in 100 A for I_1 and I_2,\ + \ and 20 mm for r, gives 0.1 N. The answer is (B).\n\nQ: In a 2 pole lap winding\ + \ dc machine , the resistance of one conductor is 2Ω and total number of conductors\ + \ is 100. Find the total resistance\n(A) 200Ω (B) 100Ω (C) 50Ω (D) 10Ω\nA: Let's\ + \ think step by step. In lap winding, effectively two resistors are connected in\ + \ parallel, so the actual resistance of each pair is 1 Ohm. Since we have 50 pairs,\ + \ we get a total resistance of 50 Ohms. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml index a85f799ca4f21f98e70511fd36c20ae29b9e4f54..b5567c0ccee8240e93ad1e82a06d12cf1fd836cc 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml @@ -1,5 +1,5 @@ -dataset_name: elementary_mathematics -description: "The following are multiple choice questions (with answers) about elementary\ +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary\ \ mathematics.\n\nQ: Olivia used the rule \"Add 11\" to create the number pattern\ \ shown below. 10, 21, 32, 43, 54. Which statement about the number pattern is true?\n\ (A) The 10th number in the pattern will be an even number.\n(B) The number pattern\ @@ -22,19 +22,20 @@ description: "The following are multiple choice questions (with answers) about e \ the other choices are incorrect. The answer is (A).\n\nQ: A store sells 107 different\ \ colors of paint. They have 25 cans of each color in storage. The number of cans\ \ of paint the store has in storage can be found using the expression below. 107\ - \ \xD7 25. How many cans of paint does the store have in storage?\n(A) 749\n(B)\ - \ 2,675\n(C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107\ - \ x 25 = (100 x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total\ - \ of 30 players will play basketball at a park. There will be exactly 5 players\ - \ on each team. Which statement correctly explains how to find the number of teams\ - \ needed?\n(A) Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n\ - (C) Multiply 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\n\ - A: Let's think step by step. We want to find the number of teams. 
We know that there\ - \ are 5 players/team, and 30 players. Thus to get the number of teams we divide\ - \ players by players/team, so 30 players / 5 players/team = 6 teams. The answer\ - \ is (B).\n\nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n\ - (B) (5 x 5) + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think\ - \ step by step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5\ - \ x 4). The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_elementary_mathematics + \ × 25. How many cans of paint does the store have in storage?\n(A) 749\n(B) 2,675\n\ + (C) 2,945\n(D) 4,250\nA: Let's think step by step. We can calculate 107 x 25 = (100\ + \ x 25) + (7 x 25) = 2500 + 175 = 2675. The answer is (B).\n\nQ: A total of 30 players\ + \ will play basketball at a park. There will be exactly 5 players on each team.\ + \ Which statement correctly explains how to find the number of teams needed?\n(A)\ + \ Add 5 to 30 to find 35 teams.\n(B) Divide 30 by 5 to find 6 teams.\n(C) Multiply\ + \ 30 and 5 to find 150 teams.\n(D) Subtract 5 from 30 to find 25 teams.\nA: Let's\ + \ think step by step. We want to find the number of teams. We know that there are\ + \ 5 players/team, and 30 players. Thus to get the number of teams we divide players\ + \ by players/team, so 30 players / 5 players/team = 6 teams. The answer is (B).\n\ + \nQ: Which expression is equivalent to 5 x 9?\n(A) (5 x 4) x (6 x 5)\n(B) (5 x 5)\ + \ + (5 x 4)\n(C) (5 x 5) + (5 x 9)\n(D) (5 x 9) x (6 x 9)\nA: Let's think step by\ + \ step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + 4) = (5 x 5) + (5 x 4). The\ + \ answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml index 5de7486cc1671e7e4e3df862c43f0e8319907385..40adf4657d26a1babf044fda6402c4c657302e87 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml @@ -1,57 +1,53 @@ -dataset_name: formal_logic -description: "The following are multiple choice questions (with answers) about formal\ +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal\ \ logic.\n\nQ: Which of the given formulas of PL is the best symbolization of the\ \ following sentence?\nTurtles live long lives and are happy creatures, unless they\ - \ are injured.\n(A) (L \u2022 H) \u2261 I (B) (L \u2022 H) \u2228 I (C) L \u2022\ - \ (H \u2228 I) (D) L \u2022 (H \u2283 R).\nA: Let's think step by step. We refer\ - \ to Wikipedia articles on formal logic for help. Let\u2019s solve this step by\ - \ step. Let \u201CL\u201D denote \u201Cliving long\u201D, H \u201Cbeing happy\u201D\ - , and \u201CI\u201D \u201Cbeing injured\u201D. Now, consider each choice:\n(A) means\ - \ (living long AND being happy) is equivalent to (being injured). \n(B) means (living\ - \ long AND being happy) OR (being injured). \n(C) means (living long) AND (being\ - \ happy OR being injured). \n(D) means (living long) AND (being happy implies being\ - \ R), but what R denotes is not clear.\nObviously, (B) is the best symbolization\ - \ of the original sentence. The answer is (B).\n\nQ: Select the best translation\ - \ into predicate logic.George borrows Hector's lawnmower. 
(g: George; h: Hector;\ - \ l: Hector's lawnmower; Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh\ - \ (D) Bghl\nA: Let's think step by step. We refer to Wikipedia articles on formal\ - \ logic for help. Let\u2019s solve this step by step. We are told that \u201CBxyx\u201D\ - \ means \u201Cx borrows y from z\u201D. We can rewrite \u201CGeorge borrows Hector's\ - \ lawnmower\u201D as \u201CGeorge borrows a lawnmower from Hector\u201D, which can\ - \ then be translated into predicate logic as \u201CBglh\u201D. The answer \u201C\ - Bglh\u201D appears in (C); therefore, (C) must be the correct answer. The answer\ - \ is (C).\n\nQ: \nSelect the best English interpretation of the given arguments\ - \ in predicate logic.\nDm\n(\u2200x)(Wx \u2283 ~Dx). \n(\u2200x)Wx \u2228 Ag\t/\ - \ (\u2203x)Ax\n(A) Marina is a dancer. Some weaklings are not dancers. Either everything\ - \ is a weakling or Georgia plays volleyball. So something plays volleyball. (B)\ - \ Marina is a dancer. No weakling is a dancer. Everything is either a weakling or\ - \ plays volleyball. So something plays volleyball. (C) Marina is a dancer. Some\ - \ weaklings are not dancers. Everything is either a weakling or plays volleyball.\ - \ So something plays volleyball. (D) Marina is a dancer. No weakling is a dancer.\ - \ Either everything is a weakling or Georgia plays volleyball. So something plays\ - \ volleyball.\nA: Let's think step by step. We refer to Wikipedia articles on formal\ - \ logic for help. Let\u2019s solve this step by step. Let \u201CD\u201D denote \u201C\ - being a dancer\u201D, \u201Cm\u201D denote \u201CMaria\u201D, \u201Cg\u201D denote\ - \ \u201CGeorgia\u201D, \u201CW\u201D denote \u201Cweakling\u201D, \u201CA\u201D\ - \ denote \u201Cplaying volleyball\u201D. Then, we have the following:\n1. Dm \u2192\ - \ Maria is a dance.\n2. (\u2200x)(Wx \u2283 ~Dx). \u2192 For all x, if x is a weakling,\ - \ then x is not a dancer. In other words, no weakling is a dancer.\n3. (\u2200x)Wx\ - \ \u2228 Ag\t/ (\u2203x)Ax \u2192 For all x, x is a weakling or Georgia plays volleyball.\ - \ So there exists an x that plays volleyball. \nOptions (A) and (C) do claim that\ - \ some weaklings are not dancers, but the second argument strongly states that no\ - \ weakling is a dancer. Thus, we can eliminate them. Option (B) omits the important\ - \ detail about Georgia playing volleyball. Option (D) has all the details presented\ - \ in the arguments and is the best English interpretation of the arguments. The\ - \ answer is (D).\n\nQ: Select the best translation into predicate logic: No people\ - \ drive on Mars.\n(A) ~Pd (B) (\u2200x)(Px \u2228 ~Dx) (C) (\u2200x)(Px \u2283 ~Dx)\ - \ (D) ~Dp\nA: Let's think step by step. We refer to Wikipedia articles on formal\ - \ logic for help. Let\u2019s solve this step by step. Let \u201CP\u201D denote \u201C\ - being on Mars\u201D and \u201CD\u201D denote \u201Cdriving on Mars\u201D. Then let\u2019\ - s consider each option:\nOption (A): ~Pd \u2192 d is not on Mars.\nOption (B): (\u2200\ - x)(Px \u2228 ~Dx) \u2192 For all x, x is on Mars and x do not drive on Mars.\nOption\ - \ (C): (\u2200x)(Px \u2283 ~Dx) \u2192 For all x, x is on Mars implies that x do\ - \ not drive on Mars.\nOption (D): ~Dp: \u2192 p do not drive on Mars.\nOf all these\ - \ options, Option (C) appears to be the best and most meaningful interpretation\ - \ of the argument \u201CNo people drive on Mars.\u201D The answer is (C)." 
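The final translation exercise above turns on reading (∀x)(Px ⊃ ~Dx) as "no people drive on Mars". A small model-checking sketch — the helper names are hypothetical and nothing here comes from the harness — enumerates every assignment of "person" and "drives" over a three-element domain and confirms the two readings coincide:

```python
from itertools import product

def formula_c(P, D, domain):
    """(∀x)(Px ⊃ ~Dx): every person is a non-driver."""
    return all((not P[x]) or (not D[x]) for x in domain)

def no_people_drive(P, D, domain):
    """Plain-English reading: there is no x that is a person and drives."""
    return not any(P[x] and D[x] for x in domain)

domain = range(3)
for bits in product([False, True], repeat=2 * len(domain)):
    P = dict(zip(domain, bits[:3]))   # which elements are people
    D = dict(zip(domain, bits[3:]))   # which elements drive
    assert formula_c(P, D, domain) == no_people_drive(P, D, domain)
print("Option (C) agrees with 'no people drive' on all 64 toy models")
```

This is just the familiar equivalence ∀x(Px → ¬Dx) ≡ ¬∃x(Px ∧ Dx), checked exhaustively on a finite domain.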
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_formal_logic
+ \ are injured.\n(A) (L • H) ≡ I (B) (L • H) ∨ I (C) L • (H ∨ I) (D) L • (H ⊃ R).\n\
+ A: Let's think step by step. We refer to Wikipedia articles on formal logic for\
+ \ help. Let’s solve this step by step. Let “L” denote “living long”, H “being happy”,\
+ \ and “I” “being injured”. Now, consider each choice:\n(A) means (living long AND\
+ \ being happy) is equivalent to (being injured). \n(B) means (living long AND being\
+ \ happy) OR (being injured). \n(C) means (living long) AND (being happy OR being\
+ \ injured). \n(D) means (living long) AND (being happy implies being R), but what\
+ \ R denotes is not clear.\nObviously, (B) is the best symbolization of the original\
+ \ sentence. The answer is (B).\n\nQ: Select the best translation into predicate\
+ \ logic.George borrows Hector's lawnmower. (g: George; h: Hector; l: Hector's lawnmower;\
+ \ Bxyx: x borrows y from z).\n(A) Blgh (B) Bhlg (C) Bglh (D) Bghl\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on formal logic for help. Let’s solve\
+ \ this step by step. We are told that “Bxyx” means “x borrows y from z”. We can\
+ \ rewrite “George borrows Hector's lawnmower” as “George borrows a lawnmower from\
+ \ Hector”, which can then be translated into predicate logic as “Bglh”. The answer\
+ \ “Bglh” appears in (C); therefore, (C) must be the correct answer. The answer is\
+ \ (C).\n\nQ: \nSelect the best English interpretation of the given arguments in\
+ \ predicate logic.\nDm\n(∀x)(Wx ⊃ ~Dx). \n(∀x)Wx ∨ Ag\t/ (∃x)Ax\n(A) Marina is a\
+ \ dancer. Some weaklings are not dancers. Either everything is a weakling or Georgia\
+ \ plays volleyball. So something plays volleyball. (B) Marina is a dancer. No weakling\
+ \ is a dancer. Everything is either a weakling or plays volleyball. So something\
+ \ plays volleyball. (C) Marina is a dancer. Some weaklings are not dancers. Everything\
+ \ is either a weakling or plays volleyball. So something plays volleyball. (D) Marina\
+ \ is a dancer. No weakling is a dancer. Either everything is a weakling or Georgia\
+ \ plays volleyball. So something plays volleyball.\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on formal logic for help. Let’s solve this step\
+ \ by step. Let “D” denote “being a dancer”, “m” denote “Marina”, “g” denote “Georgia”,\
+ \ “W” denote “weakling”, “A” denote “playing volleyball”. Then, we have the following:\n\
+ 1. Dm → Marina is a dancer.\n2. (∀x)(Wx ⊃ ~Dx). → For all x, if x is a weakling, then\
+ \ x is not a dancer. In other words, no weakling is a dancer.\n3. (∀x)Wx ∨ Ag\t\
+ / (∃x)Ax → For all x, x is a weakling or Georgia plays volleyball. So there exists\
+ \ an x that plays volleyball. \nOptions (A) and (C) do claim that some weaklings\
+ \ are not dancers, but the second argument strongly states that no weakling is a\
+ \ dancer. Thus, we can eliminate them. Option (B) omits the important detail about\
+ \ Georgia playing volleyball. Option (D) has all the details presented in the arguments\
+ \ and is the best English interpretation of the arguments. The answer is (D).\n\n\
+ Q: Select the best translation into predicate logic: No people drive on Mars.\n\
+ (A) ~Pd (B) (∀x)(Px ∨ ~Dx) (C) (∀x)(Px ⊃ ~Dx) (D) ~Dp\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on formal logic for help. Let’s solve this step\
+ \ by step. Let “P” denote “being on Mars” and “D” denote “driving on Mars”. Then\
+ \ let’s consider each option:\nOption (A): ~Pd → d is not on Mars.\nOption (B):\
+ \ (∀x)(Px ∨ ~Dx) → For all x, x is on Mars or x does not drive on Mars.\nOption (C):\
+ \ (∀x)(Px ⊃ ~Dx) → For all x, x is on Mars implies that x does not drive on Mars.\n\
+ Option (D): ~Dp → p does not drive on Mars.\nOf all these options, Option (C) appears\
+ \ to be the best and most meaningful interpretation of the argument “No people drive\
+ \ on Mars.” The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_formal_logic"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
index d5031b00f4601fd70999c124457ba10a7992c17a..0287df115e7312aa88a128c5900347c08f21e669 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml
@@ -1,33 +1,34 @@
-dataset_name: global_facts
-description: "The following are multiple choice questions (with answers) about global\
- \ facts.\n\nQ: As of 2017, how many of the world\u2019s 1-year-old children today\
- \ have been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%\n\
- A: Let's think step by step. We refer to Wikipedia articles on global facts for\
- \ help. According to data published by the World Health Organization, the nummber\
- \ of 1-year-old children vaccinated in 2017 exceeds 80%. The answer is (A).\n\n\
- Q: As of 2019, about what percentage of Americans agree that the state is run for\
- \ the benefit of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think\
- \ step by step. We refer to Wikipedia articles on global facts for help. In 2019,\
- \ about 46% percentage of Americans agree that the state is run for the benefit\
- \ of all the people. The answer is (B).\n\nQ: As of 2019, about what percentage\
- \ of Russians say it is very important to have free media in our country without\
- \ government/state censorship?\n(A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think\
- \ step by step. We refer to Wikipedia articles on global facts for help. As of 2019,\
- \ about 38% of Russians say it is very important to have free media in our country.\
- \ The answer is (A).\n\nQ: As of 2015, since 1990 forests have ____ in Europe and\
- \ have ____ in Africa and the Americas.\n(A) increased, increased (B) increased,\
- \ decreased (C) decreased, increased (D) decreased, decreased\nA: Let's think step\
- \ by step. We refer to Wikipedia articles on global facts for help. As of 2015,\
- \ since 1990 forests have increased in Europe and have decreased in Africa and the\
- \ Americas. The answer is (B).\n\nQ: Which of the following pairs of statements\
- \ are both true (as of 2019)?\n(A) People tend to be optimistic about their own\
- \ future and the future of their nation or the world. (B) People tend to be optimistic\
+"dataset_name": "global_facts"
+"description": "The following are multiple choice questions (with answers) about global\
+ \ facts.\n\nQ: As of 2017, how many of the world’s 1-year-old children today have\
+ \ been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on global facts for help. According\
+ \ to data published by the World Health Organization, the number of 1-year-old\
+ \ children vaccinated in 2017 exceeds 80%. The answer is (A).\n\nQ: As of 2019,\
+ \ about what percentage of Americans agree that the state is run for the benefit\
+ \ of all the people?\n(A) 31% (B) 46% (C) 61% (D) 76%\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on global facts for help. In 2019, about 46%\
+ \ of Americans agree that the state is run for the benefit of all the people. The\
+ \ answer is (B).\n\nQ: As of 2019, about what percentage of Russians say it is very\
+ \ important to have free media in our country without government/state censorship?\n\
+ (A) 38% (B) 53% (C) 68% (D) 83%\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on global facts for help. As of 2019, about 38% of Russians say it is\
+ \ very important to have free media in our country. The answer is (A).\n\nQ: As\
+ \ of 2015, since 1990 forests have ____ in Europe and have ____ in Africa and the\
+ \ Americas.\n(A) increased, increased (B) increased, decreased (C) decreased, increased\
+ \ (D) decreased, decreased\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on global facts for help. As of 2015, since 1990 forests have increased in Europe\
+ \ and have decreased in Africa and the Americas. The answer is (B).\n\nQ: Which\
+ \ of the following pairs of statements are both true (as of 2019)?\n(A) People tend\
+ \ to be optimistic about their own future and the future of their nation or the\
+ \ world. (B) People tend to be optimistic about their own future but pessimistic\
+ \ about the future of their nation or the world. (C) People tend to be pessimistic\
+ \ about their own future but optimistic about the future of their nation or the\
+ \ world. (D) People tend to be pessimistic about their own future and the future\
+ \ of their nation or the world.\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on global facts for help. As of 2019, most people tend to be optimistic\
+ \ about their own future but pessimistic about the future of their nation or the\
+ \ world. The answer is (B)."
+"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml index 91295fe82fc01cc6592398a2e834ea0bf0a272b8..6573d82c92057182b7a67258135b3839a9160bcb 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml @@ -1,54 +1,54 @@ -dataset_name: high_school_biology -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high\ \ school biology.\n\nQ: In animal cells, which of the following represents the most\ \ likely pathway that a secretory protein takes as it is synthesized in a cell?\n\ - (A) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013\ - rough ER (B) Ribosome\u2013Golgi apparatus\u2013rough ER\u2013secretory vesicle\u2013\ - plasma membrane (C) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory\ - \ vesicle\u2013rough ER (D) Ribosome\u2013rough ER\u2013Golgi apparatus\u2013secretory\ - \ vesicle\u2013plasma membrane\nA: Let's think step by step. Protein synthesis starts\ + (A) Plasma membrane–Golgi apparatus–ribosome–secretory vesicle–rough ER (B) Ribosome–Golgi\ + \ apparatus–rough ER–secretory vesicle–plasma membrane (C) Plasma membrane–Golgi\ + \ apparatus–ribosome–secretory vesicle–rough ER (D) Ribosome–rough ER–Golgi apparatus–secretory\ + \ vesicle–plasma membrane\nA: Let's think step by step. Protein synthesis starts\ \ at the ribosome, so we can eliminate (A) and (C). The ribosome is often in the\ \ endoplasmic reticulum and moves from there to the Golgi apparatus, where it is\ \ modified and packaged into a vesicle. The vesicle then floats to the plasma membrane\ \ and is secreted. The answer is (D).\n\nQ: A mutation in a bacterial enzyme changed\ \ a previously polar amino acid into a nonpolar amino acid. This amino acid was\ - \ located at a site distant from the enzyme\u2019s active site. How might this mutation\ - \ alter the enzyme\u2019s substrate specificity?\n(A) By changing the enzyme\u2019\ - s pH optimum (B) By changing the enzyme\u2019s location in the cell (C) By changing\ - \ the shape of the protein (D) An amino acid change away from the active site cannot\ - \ alter the enzyme\u2019s substrate specificity.\nA: Let's think step by step. A\ - \ change in an amino acid leads to a change in the primary structure of the protein.\ - \ A change in the primary structure may lead to a change in the secondary and the\ - \ tertiary structure of the protein. A change in the tertiary structure means a\ - \ change in the shape of the protein, so (C) has to be correct. Since the change\ - \ does not affect the active site of the enzyme, we do not expect the activity of\ - \ the enzyme to be affected. The answer is (C).\n\nQ: Which of the following is\ - \ not a way to form recombinant DNA?\n(A) Translation (B) Conjugation (C) Specialized\ - \ transduction (D) Transformation\nA: Let's think step by step. The introduction\ - \ of foreign DNA or RNA into bacteria or eukaryotic cells is a common technique\ - \ in molecular biology and scientific research. 
There are multiple ways foreign\ - \ DNA can be introduced into cells including transformation, transduction, conjugation,\ - \ and transfection. In contrast, (A) is not a way to form DNA: during translation\ - \ the ribosomes synthesize proteins from RNA. The answer is (A).\n\nQ: Homologous\ - \ structures are often cited as evidence for the process of natural selection. All\ - \ of the following are examples of homologous structures EXCEPT\n(A) the wings of\ - \ a bird and the wings of a bat (B) the flippers of a whale and the arms of a man\ - \ (C) the pectoral fins of a porpoise and the flippers of a seal (D) the forelegs\ - \ of an insect and the forelimbs of a dog\nA: Let's think step by step. \u200B\u200B\ - Homologous structures are similar physical features in organisms that share a common\ - \ ancestor \u200B\u200Bbut different functions. Comparisons (B) and (C) are clearly\ - \ homologous because they share a common ancestor and the structures serve different\ - \ purposes. Bat wings and birg wings are also homologous, while they are both wings,\ - \ the forelimbs serve different purposes. Insects and dogs are very far ancestors\ - \ since one is vertebrate while the other is invertebrate and the forelimbs serve\ - \ the same purpose, so they are not homologous. The answer is (D).\n\nQ: Which of\ - \ the following is not known to be involved in the control of cell division?\n(A)\ - \ Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast cells\nA: Let's think\ - \ step by step. Normal cells move through the cell cycle in a regulated way. At\ - \ the checkpoint stage, they use information about their own internal state and\ - \ cues from the environment around them to decide whether to proceed with cell division.\ - \ Cues like these act by changing the activity of core cell cycle regulators inside\ - \ the cell. The most common regulators are cyclins and cyclin-dependent kinases.\ - \ Fibroblast cells do not play any role in cell division. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_biology + \ located at a site distant from the enzyme’s active site. How might this mutation\ + \ alter the enzyme’s substrate specificity?\n(A) By changing the enzyme’s pH optimum\ + \ (B) By changing the enzyme’s location in the cell (C) By changing the shape of\ + \ the protein (D) An amino acid change away from the active site cannot alter the\ + \ enzyme’s substrate specificity.\nA: Let's think step by step. A change in an amino\ + \ acid leads to a change in the primary structure of the protein. A change in the\ + \ primary structure may lead to a change in the secondary and the tertiary structure\ + \ of the protein. A change in the tertiary structure means a change in the shape\ + \ of the protein, so (C) has to be correct. Since the change does not affect the\ + \ active site of the enzyme, we do not expect the activity of the enzyme to be affected.\ + \ The answer is (C).\n\nQ: Which of the following is not a way to form recombinant\ + \ DNA?\n(A) Translation (B) Conjugation (C) Specialized transduction (D) Transformation\n\ + A: Let's think step by step. The introduction of foreign DNA or RNA into bacteria\ + \ or eukaryotic cells is a common technique in molecular biology and scientific\ + \ research. There are multiple ways foreign DNA can be introduced into cells including\ + \ transformation, transduction, conjugation, and transfection. 
In contrast, (A)\
+ \ is not a way to form DNA: during translation the ribosomes synthesize proteins\
+ \ from RNA. The answer is (A).\n\nQ: Homologous structures are often cited as evidence\
+ \ for the process of natural selection. All of the following are examples of homologous\
+ \ structures EXCEPT\n(A) the wings of a bird and the wings of a bat (B) the flippers\
+ \ of a whale and the arms of a man (C) the pectoral fins of a porpoise and the flippers\
+ \ of a seal (D) the forelegs of an insect and the forelimbs of a dog\nA: Let's think\
+ \ step by step. Homologous structures are similar physical features in organisms\
+ \ that share a common ancestor but have different functions. Comparisons (B) and (C)\
+ \ are clearly homologous because they share a common ancestor and the structures\
+ \ serve different purposes. Bat wings and bird wings are also homologous: while\
+ \ they are both wings, these forelimbs serve different purposes. Insects and dogs\
+ \ are only very distantly related, since one is an invertebrate while the other is\
+ \ a vertebrate, and their forelimbs serve the same purpose, so they are not homologous.\
+ \ The answer is (D).\n\nQ: Which of the following is not known to be involved in the control\
+ \ of cell division?\n(A) Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast\
+ \ cells\nA: Let's think step by step. Normal cells move through the cell cycle in\
+ \ a regulated way. At the checkpoint stage, they use information about their own\
+ \ internal state and cues from the environment around them to decide whether to\
+ \ proceed with cell division. Cues like these act by changing the activity of core\
+ \ cell cycle regulators inside the cell. The most common regulators are cyclins\
+ \ and cyclin-dependent kinases. Fibroblast cells do not play any role in cell division.\
+ \ The answer is (D)."
+"group": "mmlu_flan_cot_fewshot_stem"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_high_school_biology"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml
index ce2a26cc2f028eb273f73a8747e0faec43c4a60b..577a48662cd6b6bf28e084ac1e524cd44b172e76 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml
@@ -1,5 +1,5 @@
-dataset_name: high_school_chemistry
-description: "The following are multiple choice questions (with answers) about high\
+"dataset_name": "high_school_chemistry"
+"description": "The following are multiple choice questions (with answers) about high\
 \ school chemistry.\n\nQ: Which of the following is considered an acid anhydride?\n\
 (A) HCl (B) H2SO3 (C) SO2 (D) Al(NO3)3\nA: Let's think step by step. An acid anhydride\
 \ is a compound that is derived by removing water from an acid. The chemical formula\
@@ -45,5 +45,6 @@ description: "The following are multiple choice questions (with answers) about h
 \ the acetate ion. The added strong acid, Nitric acid, will react with the conjugate\
 \ base. Therefore the maximum amount of acid that can be added will be equal to\
 \ the amount of acetate ion, or 2 moles. The answer is (C)."
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_chemistry +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml index 16a9f66dfda7acb33f974a57adca2b296339a040..6b0e0c8f58c1a34edcbd08c3dd010120e6ab0480 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_computer_science -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ \ school computer science.\n\nQ: Which of the following is an example of the use\ \ of a device on the Internet of Things (IoT) ?\n(A) A car alerts a driver that\ \ it is about to hit an object. (B) A hiker uses a G P S watch to keep track of\ @@ -26,9 +26,9 @@ description: "The following are multiple choice questions (with answers) about h \ launched from any web sites visited or files downloaded.\nA: Let's think step\ \ by step. Choice A is incorrect as it only describes network traffic, which an\ \ anonymous browser does not change. Choice B is correct as it correctly describes\ - \ how an anonymous browser will prevent saving data on the user\u2019s computer\ - \ after the session is ended. Choice C is incorrect because an anonymous browser\ - \ will not prevent logging in to email or social media accounts. Choice D is incorrect\ + \ how an anonymous browser will prevent saving data on the user’s computer after\ + \ the session is ended. Choice C is incorrect because an anonymous browser will\ + \ not prevent logging in to email or social media accounts. Choice D is incorrect\ \ because an anonymous browser in itself performs no virus protection. The answer\ \ is (B).\n\nQ: In the program below, the initial value of X is 5 and the initial\ \ value of Y is 10.\nIF (X < 0){\n DISPLAY (\"Foxtrot\")\n} ELSE {\n IF (X > Y){\n\ @@ -66,5 +66,6 @@ description: "The following are multiple choice questions (with answers) about h \ its value is greater than 100, regardless of the elements in the list. Choice\ \ D is incorrect because its step 3 does not increment the value of position, so\ \ it will repeat forever. The answer is (B)." 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_computer_science +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml index 0e7aafcc1237b0aa70fa2beaf36adb3fd6c38327..ca8ec93fbaf2d694dc55907e0f52225588f3be7e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_european_history -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high\ \ school european history.\n\nQ: This question refers to the following information.\n\ Albeit the king's Majesty justly and rightfully is and ought to be the supreme head\ \ of the Church of England, and so is recognized by the clergy of this realm in\ @@ -34,7 +34,7 @@ description: "The following are multiple choice questions (with answers) about h \ the corruption in the Church of England. The answer is (D).\n\nQ: This question\ \ refers to the following information.\nRead the following excerpt.\nThe revolutionary\ \ seed had penetrated into every country and spread more or less. It was greatly\ - \ developed under the r\xE9gime of the military despotism of Bonaparte. His conquests\ + \ developed under the régime of the military despotism of Bonaparte. His conquests\ \ displaced a number of laws, institutions, and customs; broke through bonds sacred\ \ among all nations, strong enough to resist time itself; which is more than can\ \ be said of certain benefits conferred by these innovators.\nThe monarchs will\ @@ -55,9 +55,9 @@ description: "The following are multiple choice questions (with answers) about h Let them maintain religious principles in all their purity, and not allow the faith\ \ to be attacked and morality interpreted according to the social contract or the\ \ visions of foolish sectarians.\nLet them suppress Secret Societies; that gangrene\ - \ of society.\n\u2014Klemens von Metternich, Political Confession of Faith, 1820\n\ - Which of the following was the greatest cause of the fears expressed by Metternich\ - \ in the document above?\n(A) The ideas of personal liberty and nationalism conceived\ + \ of society.\n—Klemens von Metternich, Political Confession of Faith, 1820\nWhich\ + \ of the following was the greatest cause of the fears expressed by Metternich in\ + \ the document above?\n(A) The ideas of personal liberty and nationalism conceived\ \ during the Enlightenment resulted in radical revolutions that could spread throughout\ \ Europe. (B) The conquest of Europe by Napoleon led to the creation of new factions\ \ and shifted the European balance of power. (C) The power of monarchs had grown\ @@ -110,15 +110,15 @@ description: "The following are multiple choice questions (with answers) about h \ were all turning to the politicians; the famous Nihilists who made Europe tremble-sons\ \ of village priests, of the lower middle class, of tradesmen-could not rise above\ \ the idea of national liberation, and seemed to believe that the world would be\ - \ delivered-when they had killed their despot&\u2026\n\"Foolery! 
They'll never get\ - \ out of it with their foolery.\"\nThen, lowering his voice still more, in a few\ - \ bitter words he described his old dream of fraternity. He had renounced his rank\ - \ and his fortune; he had gone among workmen, only in the hope of seeing at last\ - \ the foundation of a new society of labour in common. All the sous in his pockets\ - \ had long gone to the urchins of the settlement; he had been as tender as a brother\ + \ delivered-when they had killed their despot&…\n\"Foolery! They'll never get out\ + \ of it with their foolery.\"\nThen, lowering his voice still more, in a few bitter\ + \ words he described his old dream of fraternity. He had renounced his rank and\ + \ his fortune; he had gone among workmen, only in the hope of seeing at last the\ + \ foundation of a new society of labour in common. All the sous in his pockets had\ + \ long gone to the urchins of the settlement; he had been as tender as a brother\ \ with the colliers, smiling at their suspicion, winning them over by his quiet\ \ workmanlike ways and his dislike of chattering. But decidedly the fusion had not\ - \ taken place.\nHis voice changed, his eyes grew bright, he fixed them on \xE9tienne,\ + \ taken place.\nHis voice changed, his eyes grew bright, he fixed them on étienne,\ \ directly addressing him:\n\"Now, do you understand that? These hatworkers at Marseilles\ \ who have won the great lottery prize of a hundred thousand francs have gone off\ \ at once and invested it, declaring that they are going to live without doing anything!\ @@ -127,7 +127,7 @@ description: "The following are multiple choice questions (with answers) about h \ out as much as you like against the rich, you haven't got courage enough to give\ \ back to the poor the money that luck brings you. You will never be worthy of happiness\ \ as long as you own anything, and your hatred of the bourgeois proceeds solely\ - \ from an angry desire to be bourgeois yourselves in their place.\"\n\xE9mile Zola,\ + \ from an angry desire to be bourgeois yourselves in their place.\"\némile Zola,\ \ French writer, Germinal, 1885\nThe passage displays the direct concern for the\ \ welfare of the working classes that was typically a part of which movement?\n\ (A) Capitalist (B) Scientific (C) Communist (D) Existentialist\nA: Let's think step\ @@ -156,13 +156,14 @@ description: "The following are multiple choice questions (with answers) about h \ whether Jewish, Christian or Turkish, appear to me no other than human inventions,\ \ set up to terrify and enslave mankind, and monopolize power and profit.\nI do\ \ not mean by this declaration to condemn those who believe otherwise; they have\ - \ the same right to their belief as I have to mine.\n\u2014Thomas Paine, The Age\ - \ of Reason, 1794\u20131795\nWhich of the following Enlightenment philosophes designed\ - \ a system of checks and balances for government to avoid abuses of power?\n(A)\ - \ Jean Jacques Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft (D) Adam Smith\n\ - A: Let's think step by step. We refer to Wikipedia articles on european history\ - \ for help. 
Baron Montesquieu was a 18th centrury French philsopher who wrote extensively\
+ \ the same right to their belief as I have to mine.\n—Thomas Paine, The Age of Reason,\
+ \ 1794–1795\nWhich of the following Enlightenment philosophes designed a system\
+ \ of checks and balances for government to avoid abuses of power?\n(A) Jean Jacques\
+ \ Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft (D) Adam Smith\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on european history for help.\
+ \ Baron Montesquieu was an 18th century French philosopher who wrote extensively\
- \ against the monoplization of power and advocated for a system of checks and balances\
+ \ against the monopolization of power and advocated for a system of checks and balances\
 \ in government to prevent the rise of despotism. The answer is (B)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_high_school_european_history
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_high_school_european_history"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml
index 42f6c04061828c92585c516e0c234ba54b672a7d..87c278681cfa06c981d5403e31e2fe7ea6a707d0 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml
@@ -1,63 +1,37 @@
-dataset_name: high_school_geography
-description: 'The following are multiple choice questions (with answers) about high
-  school geography.
-
-
-  Q: Which one of the following items is an example of nonmaterial culture?
-
-  (A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).
-
-  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
-  Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not
-  physical objects. The answer is (C).
-
-
-  Q: During the third stage of the demographic transition model, which of the following
-  is true?
-
-  (A) Birth rates increase and population growth rate is less rapid. (B) Birth rates
-  decline and population growth rate is less rapid. (C) Birth rates increase and population
-  growth rate increases. (D) Birth rates decrease and population growth rate increases.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
-  The demographic transition model models the five different stages of population
-  growth as a country goes through economic development, where the third stage refers
-  to a period of declining birth rates and lower population growth. The answer is
-  (B).
-
-
-  Q: The practice of hiring a foreign third-party service provider to run an operation
-  is called
-
-  (A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on geography for help.
-  "Offshoring" literally means to move or base some of the activities or processes
-  of a company to a foreign country. The answer is (B).
-
-
-  Q: Which of the following statements is NOT accurate regarding the services provided
-  by local governments in the United States?
-
-  (A) Duplication of efforts occurs often. (B) Social problems of the central city
-  spill over into the surrounding residential suburbs. (C) Inefficiency in providing
-  services occurs often. (D) One neighborhood''s efforts to reduce pollution are always
-  supported by neighboring communities.
-
-  A: Let''s think step by step. 
We refer to Wikipedia articles on geography for help. - There may be economic, social or political reasons for two neighboring communities - and their local governments not agreeing to pollution reduction efforts initiated - by one of them. The answer is (D). - - - Q: The rate of natural increase of a population is found by subtracting the - - (A) crude death rate from the crude birth date. (B) crude birth rate from the crude - death rate. (C) doubling time from the crude birth rate. (D) fertility rate from - the crude death rate. - - A: Let''s think step by step. We refer to Wikipedia articles on geography for help. - The difference between number of births and deaths gives the population increase - at any given time. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_geography +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\nQ: Which one of the following items is an example of nonmaterial\ + \ culture?\n(A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).\n\ + A: Let's think step by step. We refer to Wikipedia articles on geography for help.\ + \ Nonmaterial culture consists of cultural ideas, beliefs or symbols that are not\ + \ physical objects. The answer is (C).\n\nQ: During the third stage of the demographic\ + \ transition model, which of the following is true?\n(A) Birth rates increase and\ + \ population growth rate is less rapid. (B) Birth rates decline and population growth\ + \ rate is less rapid. (C) Birth rates increase and population growth rate increases.\ + \ (D) Birth rates decrease and population growth rate increases.\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on geography for help. The demographic\ + \ transition model models the five different stages of population growth as a country\ + \ goes through economic development, where the third stage refers to a period of\ + \ declining birth rates and lower population growth. The answer is (B).\n\nQ: The\ + \ practice of hiring a foreign third-party service provider to run an operation\ + \ is called\n(A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.\n\ + A: Let's think step by step. We refer to Wikipedia articles on geography for help.\ + \ \"Offshoring\" literally means to move or base some of the activities or processes\ + \ of a company to a foreign country. The answer is (B).\n\nQ: Which of the following\ + \ statements is NOT accurate regarding the services provided by local governments\ + \ in the United States?\n(A) Duplication of efforts occurs often. (B) Social problems\ + \ of the central city spill over into the surrounding residential suburbs. (C) Inefficiency\ + \ in providing services occurs often. (D) One neighborhood's efforts to reduce pollution\ + \ are always supported by neighboring communities.\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on geography for help. There may be economic, social\ + \ or political reasons for two neighboring communities and their local governments\ + \ not agreeing to pollution reduction efforts initiated by one of them. The answer\ + \ is (D).\n\nQ: The rate of natural increase of a population is found by subtracting\ + \ the\n(A) crude death rate from the crude birth date. (B) crude birth rate from\ + \ the crude death rate. (C) doubling time from the crude birth rate. 
(D) fertility\
+ \ rate from the crude death rate.\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on geography for help. The difference between the number of births and deaths\
+ \ gives the population increase at any given time. The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_social_sciences"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_high_school_geography"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml
index 8ec1c5b02c2ef000318169b4ae64097ebf9ca3f0..a0a67146029bf3aa82d025674e35c0bc0ea564c3 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml
@@ -1,67 +1,43 @@
-dataset_name: high_school_government_and_politics
-description: 'The following are multiple choice questions (with answers) about high
-  school government and politics.
-
-
-  Q: Which of the following best states an argument made by James Madison in The Federalist
-  number 10?
-
-  (A) Honest politicians can prevent factions from developing. (B) Factions are more
-  likely to occur in large republics than in small ones. (C) The negative effects
-  of factionalism can be reduced by a republican government. (D) Free elections are
-  the people''s best defense against factionalism.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
-  for help. In the Federalist number 10, James Madison advocated for a representative
-  republican form of government to guard against factionalism. The answer is (C).
-
-
-  Q: The term "budget deficit" refers to the
-
-  (A) annual increase in federal spending on the military (B) amount of interest on
-  the national debt (C) difference between the initial budget proposals made by the
-  president and Congress (D) amount the government spends in excess of its revenues
-
-  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
-  for help. When the goverment spends more than it earns, their difference is the
-  budget deficit. The answer is (D).
-
-
-  Q: Which of the following statements about cabinet departments is FALSE?
-
-  (A) They are established by the legislative branch. (B) Their members often don''t
-  have much influence over presidential decisions. (C) They cannot all be run by leaders
-  who belong to the same political party the president does. (D) Not every federal
-  agency is a cabinet department.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
-  for help. There is no law stipulating that some cabinet department leaders have
-  to belong to a political party different from that of the president. The answer
-  is (C).
-
-
-  Q: Which of the following cases established the precedent that a defendant must
-  be informed of the right to remain silent, the right to a lawyer, and protection
-  from self-incrimination?
-
-  (A) Weeks v. United States (B) Betts v. Brady (C) Mapp v. Ohio (D) Miranda v. Arizona
-
-  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
-  for help. In the landmark Miranda v. Arizona in 1966, the US Supreme Court, based
-  on the Fifth and Sixth Amendment of the US Constitution, guaranteed a defendant''s
-  right to an attorney and protection from self-incrimination. The answer is (D).
-
-
-  Q: Uncertainty over the limits to presidential power is caused primarily by the
-  fact that
-
-  (A) the constitutional definition of those powers is broad and unspecific (B) most
-  people agree that the Constitution places too many limits on presidential power
-  (C) the Supreme Court consistently refuses to rule on cases concerning presidential
-  powers (D) constitutional amendments have greatly increased presidential powers
-
-  A: Let''s think step by step. We refer to Wikipedia articles on government and politics
-  for help. The US Constitution is not very specific about the powers of the president,
-  leading to uncertainty over its limits. The answer is (A).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_high_school_government_and_politics
+"dataset_name": "high_school_government_and_politics"
+"description": "The following are multiple choice questions (with answers) about high\
+ \ school government and politics.\n\nQ: Which of the following best states an argument\
+ \ made by James Madison in The Federalist number 10?\n(A) Honest politicians can\
+ \ prevent factions from developing. (B) Factions are more likely to occur in large\
+ \ republics than in small ones. (C) The negative effects of factionalism can be\
+ \ reduced by a republican government. (D) Free elections are the people's best defense\
+ \ against factionalism.\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on government and politics for help. In the Federalist number 10, James Madison\
+ \ advocated for a representative republican form of government to guard against\
+ \ factionalism. The answer is (C).\n\nQ: The term \"budget deficit\" refers to the\n\
+ (A) annual increase in federal spending on the military (B) amount of interest on\
+ \ the national debt (C) difference between the initial budget proposals made by\
+ \ the president and Congress (D) amount the government spends in excess of its revenues\n\
+ A: Let's think step by step. We refer to Wikipedia articles on government and politics\
+ \ for help. When the government spends more than it earns, the difference is the\
+ \ budget deficit. The answer is (D).\n\nQ: Which of the following statements about\
+ \ cabinet departments is FALSE?\n(A) They are established by the legislative branch.\
+ \ (B) Their members often don't have much influence over presidential decisions.\
+ \ (C) They cannot all be run by leaders who belong to the same political party the\
+ \ president does. (D) Not every federal agency is a cabinet department.\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on government and politics\
+ \ for help. There is no law stipulating that some cabinet department leaders have\
+ \ to belong to a political party different from that of the president. The answer\
+ \ is (C).\n\nQ: Which of the following cases established the precedent that a defendant\
+ \ must be informed of the right to remain silent, the right to a lawyer, and protection\
+ \ from self-incrimination?\n(A) Weeks v. United States (B) Betts v. Brady (C) Mapp\
+ \ v. Ohio (D) Miranda v. Arizona\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on government and politics for help. In the landmark Miranda v. Arizona\
+ \ in 1966, the US Supreme Court, based on the Fifth and Sixth Amendments of the US\
+ \ Constitution, guaranteed a defendant's right to an attorney and protection from\
+ \ self-incrimination. 
The answer is (D).\n\nQ: Uncertainty over the limits to presidential\ + \ power is caused primarily by the fact that\n(A) the constitutional definition\ + \ of those powers is broad and unspecific (B) most people agree that the Constitution\ + \ places too many limits on presidential power (C) the Supreme Court consistently\ + \ refuses to rule on cases concerning presidential powers (D) constitutional amendments\ + \ have greatly increased presidential powers\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on government and politics for help. The US Constitution\ + \ is not very specific about the powers of the president, leading to uncertainty\ + \ over its limits. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml index f47a83e65bdc07f1144cb3b07178379d73cc31b5..c82b0739a034fb85e3dc85e50dde40b5b33f2f06 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml @@ -1,64 +1,37 @@ -dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high - school macroeconomics. - - - Q: Which of the following policies best describes supply-side fiscal policy? - - (A) An increase in the money supply (B) Increased government spending (C) Lower - taxes on research and development of new technology (D) Higher taxes on household - income - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. Supply-side fiscal policy stimulates the economy by encouraging more production - of goods and services through reduction in taxes and deregulation. The answer is - (C). - - - Q: The short-run Phillips curve indicates a - - (A) direct relation between unemployment and inflation (B) direct relation between - price and quantity demanded (C) inverse relation between price and quantity demanded - (D) inverse relation between unemployment and inflation - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. The short-run Phillips curve shows that whenever unemployment decreases below - a natural level, the inflation starts increasing, and vice-versa. The answer is - (D). - - - Q: Holding all else equal which of the following monetary policies would be used - to boost U.S. exports? - - (A) Increasing the discount rate (B) Increasing the reserve ratio (C) Buying government - securities (D) Lowering tariffs - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. Buying government securities leads to reduction in demand for US dollars from - foreign buyers, thereby making it cheaper and hence making US exports more attractive. - The answer is (C). - - - Q: A federal deficit occurs when - - (A) exports exceed imports. (B) imports exceed exports. (C) federal tax collections - exceed spending. (D) federal spending exceeds federal tax revenues. - - A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for - help. A federal deficit occurs when federal spending exceeds federal income which - is primarily from tax revenues. The answer is (D). - - - Q: Which of the following is not included in the U.S. GDP? 
-
-  (A) The U.S. military opens a new base in a foreign country with 1000 U.S. personnel.
-  (B) Japanese consumers buy thousands of CDs produced in the United States. (C) An
-  American pop singer performs a sold-out concert in Paris. (D) A French theatrical
-  production tours dozens of American cities.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on macroeconomics for
-  help. The economic transactions related to the performance of the American pop-singer
-  in Paris happens entirely outside the U.S. and hence is not included in the GDP
-  numbers. The answer is (C).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_high_school_macroeconomics
+"dataset_name": "high_school_macroeconomics"
+"description": "The following are multiple choice questions (with answers) about high\
+ \ school macroeconomics.\n\nQ: Which of the following policies best describes supply-side\
+ \ fiscal policy?\n(A) An increase in the money supply (B) Increased government spending\
+ \ (C) Lower taxes on research and development of new technology (D) Higher taxes\
+ \ on household income\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on macroeconomics for help. Supply-side fiscal policy stimulates the economy by\
+ \ encouraging more production of goods and services through reduction in taxes and\
+ \ deregulation. The answer is (C).\n\nQ: The short-run Phillips curve indicates\
+ \ a\n(A) direct relation between unemployment and inflation (B) direct relation\
+ \ between price and quantity demanded (C) inverse relation between price and quantity\
+ \ demanded (D) inverse relation between unemployment and inflation\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on macroeconomics for help. The short-run\
+ \ Phillips curve shows that whenever unemployment decreases below its natural level,\
+ \ inflation starts increasing, and vice versa. The answer is (D).\n\nQ: Holding\
+ \ all else equal which of the following monetary policies would be used to boost\
+ \ U.S. exports?\n(A) Increasing the discount rate (B) Increasing the reserve ratio\
+ \ (C) Buying government securities (D) Lowering tariffs\nA: Let's think step by\
+ \ step. We refer to Wikipedia articles on macroeconomics for help. Buying government\
+ \ securities leads to a reduction in demand for US dollars from foreign buyers, thereby\
+ \ making the dollar cheaper and hence making US exports more attractive. The answer is (C).\n\
+ \nQ: A federal deficit occurs when\n(A) exports exceed imports. (B) imports exceed\
+ \ exports. (C) federal tax collections exceed spending. (D) federal spending exceeds\
+ \ federal tax revenues.\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on macroeconomics for help. A federal deficit occurs when federal spending exceeds\
+ \ federal income, which comes primarily from tax revenues. The answer is (D).\n\nQ:\
+ \ Which of the following is not included in the U.S. GDP?\n(A) The U.S. military\
+ \ opens a new base in a foreign country with 1000 U.S. personnel. (B) Japanese consumers\
+ \ buy thousands of CDs produced in the United States. (C) An American pop singer\
+ \ performs a sold-out concert in Paris. (D) A French theatrical production tours\
+ \ dozens of American cities.\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on macroeconomics for help. The economic transactions related to the\
+ \ performance of the American pop singer in Paris happen entirely outside the U.S.\
+ \ and hence are not included in the GDP numbers. The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_social_sciences"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_high_school_macroeconomics"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml
index eb692a09d3817781242b29f73af3296057744f96..a73a8290f3270ed1b78cf85186fd75bfe57762d6 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml
@@ -1,5 +1,5 @@
-dataset_name: high_school_mathematics
-description: "The following are multiple choice questions (with answers) about high\
+"dataset_name": "high_school_mathematics"
+"description": "The following are multiple choice questions (with answers) about high\
 \ school mathematics.\n\nQ: Simplify and write the result with a rational denominator:\
 \ $$\\sqrt{\\sqrt[3]{\\sqrt{\\frac{1}{729}}}}$$\n(A) \\frac{3\\sqrt{3}}{3} (B) \\\
 frac{1}{3} (C) \\sqrt{3} (D) \\frac{\\sqrt{3}}{3}\nA: Let's think step by step.\
@@ -13,7 +13,7 @@ description: "The following are multiple choice questions (with answers) about h
 \ of $9600/300=32=2^5$. Since at this interest rate it takes six years for it to\
 \ double, it will take $5*6=30$ years to grow to $\\$9600$. The answer is (C).\n\
 \nQ: Ten students take a biology test and receive the following scores: 45, 55,\
- \ 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students\u2019 test scores?\n\
+ \ 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students’ test scores?\n\
 (A) 55 (B) 60 (C) 62 (D) 65\nA: Let's think step by step. There are 10 students\
 \ and the sum of their scores is $45 + 55 + 50 + 70 + 65 + 80 + 40 + 90 + 70 + 85\
 \ = 650$, the mean is $650/10=65$. The answer is (D).\n\nQ: The variable $x$ varies\
@@ -32,5 +32,6 @@ description: "The following are multiple choice questions (with answers) about h
 \ dance.)\n(A) 3 (B) 15 (C) 6 (D) 5\nA: Let's think step by step. The least common\
 \ multiple of 2, 3 and 5 is 30, so during a 7 minute dance, all the three lights\
 \ will come on at the same time $2*7+1=15$ times. The answer is (B)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_high_school_mathematics
+"group": "mmlu_flan_cot_fewshot_stem"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_high_school_mathematics"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml
index 86c83c8242202049fb977a24be30821ea84e908a..8c000e6d0d527671a0def0a41d9f23dbddde2e5d 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml
@@ -1,63 +1,40 @@
-dataset_name: high_school_microeconomics
-description: 'The following are multiple choice questions (with answers) about high
-  school microeconomics.
-
-
-  Q: Which of the following is necessarily a characteristic of oligopoly?
-
-  (A) Free entry into and exit from the market (B) A few large producers (C) One producer
-  of a good with no close substitutes (D) A homogenous product
-
-  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
-  help. An oligopoly is when a market is dominated by just one or a few number of
-  sellers or producers. To get oligopoly, the market should have high barriers to
-  new entry, and the product has differentiation. The answer is (B).
-
-
-  Q: If the government subsidizes producers in a perfectly competitive market, then
-
-  (A) the demand for the product will increase (B) the demand for the product will
-  decrease (C) the consumer surplus will increase (D) the consumer surplus will decrease
-
-  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
-  help. (A) and (B) are wrong because the demand curve does not change at all. If
-  the government subsidizes producers, the supply will increase, and thus the consumer
-  surplus also increases. The answer is (C).
-
-
-  Q: Which of the following is true of a price floor?
-
-  (A) The price floor shifts the demand curve to the left. (B) An effective floor
-  creates a shortage of the good. (C) The price floor shifts the supply curve of the
-  good to the right. (D) To be an effective floor, it must be set above the equilibrium
-  price.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
-  help. Price floor does not shift the demand or shift curve. An effective price floor
-  should be set above the equilibrium price, otherwise the market bears and the floor
-  does not have effective effect. The answer is (D).
-
-
-  Q: The concentration ratio for a monopoly is
-
-  (A) 0 (B) 5 (C) 10 (D) 100
-
-  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
-  help. The concentration ratio is calculated as the sum of market share of a specific
-  number of largest companies. Monopoly means one company or entity controls the entire
-  market, therefore, the concentration ratio is 100 percent. The answer is (D).
-
-
-  Q: In a competitive labor market for housepainters, which of the following would
-  increase the demand for housepainters?
-
-  (A) An effective minimum wage imposed on this labor market. (B) An increase in the
-  price of gallons of paint. (C) An increase in the construction of new houses. (D)
-  An increase in the price of mechanical painters so long as the output effect exceeds
-  the substitution effect.
-
-  A: Let''s think step by step. We refer to Wikipedia articles on microeconomics for
-  help. An increase in the construction of new houses means an increase demand of
-  in-house painting, thus increases the demand for housepainters. The answer is (C).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_high_school_microeconomics
+"dataset_name": "high_school_microeconomics"
+"description": "The following are multiple choice questions (with answers) about high\
+ \ school microeconomics.\n\nQ: Which of the following is necessarily a characteristic\
+ \ of oligopoly?\n(A) Free entry into and exit from the market (B) A few large producers\
+ \ (C) One producer of a good with no close substitutes (D) A homogenous product\n\
+ A: Let's think step by step. We refer to Wikipedia articles on microeconomics for\
+ \ help. An oligopoly is when a market is dominated by just a few large\
+ \ sellers or producers. To get an oligopoly, the market should have high barriers to\
+ \ new entry, and the products should be differentiated. The answer is (B).\n\nQ: If the\
+ \ government subsidizes producers in a perfectly competitive market, then\n(A) the\
+ \ demand for the product will increase (B) the demand for the product will decrease\
+ \ (C) the consumer surplus will increase (D) the consumer surplus will decrease\n\
+ A: Let's think step by step. We refer to Wikipedia articles on microeconomics for\
+ \ help. (A) and (B) are wrong because the demand curve does not change at all. If\
+ \ the government subsidizes producers, the supply will increase, and thus the consumer\
+ \ surplus also increases. The answer is (C).\n\nQ: Which of the following is true\
+ \ of a price floor?\n(A) The price floor shifts the demand curve to the left. (B)\
+ \ An effective floor creates a shortage of the good. (C) The price floor shifts\
+ \ the supply curve of the good to the right. (D) To be an effective floor, it must\
+ \ be set above the equilibrium price.\nA: Let's think step by step. We refer to\
+ \ Wikipedia articles on microeconomics for help. A price floor does not shift the\
+ \ demand or supply curve. An effective price floor should be set above the equilibrium\
+ \ price; otherwise the market stays at the equilibrium price and the floor has no effect.\
+ \ The answer is (D).\n\nQ: The concentration ratio for a monopoly is\n(A) 0 (B)\
+ \ 5 (C) 10 (D) 100\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on microeconomics for help. The concentration ratio is calculated as the sum of\
+ \ the market shares of a specific number of the largest companies. Monopoly means one company\
+ \ or entity controls the entire market; therefore, the concentration ratio is 100\
+ \ percent. The answer is (D).\n\nQ: In a competitive labor market for housepainters,\
+ \ which of the following would increase the demand for housepainters?\n(A) An effective\
+ \ minimum wage imposed on this labor market. (B) An increase in the price of gallons\
+ \ of paint. (C) An increase in the construction of new houses. (D) An increase in\
+ \ the price of mechanical painters so long as the output effect exceeds the substitution\
+ \ effect.\nA: Let's think step by step. We refer to Wikipedia articles on microeconomics\
+ \ for help. An increase in the construction of new houses means an increased demand\
+ \ for house painting, which thus increases the demand for housepainters. The answer\
+ \ is (C)."
+"group": "mmlu_flan_cot_fewshot_social_sciences"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_high_school_microeconomics"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml
index f21a183c2ff62fb6d7fa7166d9d174f51e477b4f..92963bd6d8430c452e74404c30f39167c9051a4b 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml
@@ -1,38 +1,39 @@
-dataset_name: high_school_physics
-description: "The following are multiple choice questions (with answers) about high\
+"dataset_name": "high_school_physics"
+"description": "The following are multiple choice questions (with answers) about high\
 \ school physics.\n\nQ: A microwave oven is connected to an outlet, 120 V, and draws\
 \ a current of 2 amps. At what rate is energy being used by the microwave oven?\n\
 (A) 10 W (B) 30 W (C) 60 W (D) 240 W\nA: Let's think step by step. Rate of energy\
- \ usage is known as power; in an dissipative electrical circuit, power is given\
+ \ usage is known as power; in a dissipative electrical circuit, power is given\
 \ by voltage times current. 
So in our case, the power is 120 V times 2 amps, or\ \ 240 W. The answer is (D).\n\nQ: A point charge, Q = +1 mC, is fixed at the origin.\ - \ How much work is required to move a charge, Q = +8 \xB5C, from the point (0, 4\ - \ meters) to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\n\ - A: Let's think step by step. To calculate the work required to move a charge from\ - \ one location to another in a fixed electric field, it is enough to calculate the\ - \ potential difference between the two locations. Here, the potential only depends\ - \ on the distance between the charges; it\u2019s $k q_1 q_2 / r$, where $k$ is Coulomb\u2019\ - s constant. Plugging in values $q_1 = $ 1 mC, $q_2 = 8 \\mu$ C, gives the answer\ - \ as 5.992 J, which rounds to 6 J. The answer is (B).\n\nQ: Which of the following\ - \ conditions will ensure that angular momentum is conserved? I. Conservation of\ - \ linear momentum II. Zero net external force III. Zero net external torque\n(A)\ - \ I and II only (B) I and III only (C) II and III only (D) III only\nA: Let's think\ - \ step by step. Torque is defined as the change in angular momentum; if there is\ - \ zero external torque, angular momentum is conserved. The answer is (D).\n\nQ:\ - \ A photocell of work function \u03D5 = 2eV is connected to a resistor in series.\ - \ Light of frequency f = 1 \xD7 10^15 Hz hits a metal plate of the photocell. If\ - \ the power of the light is P = 100 W, what is the current through the resistor?\n\ - (A) 2:00 AM (B) 6:00 AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The\ - \ only answer above which has units of current is D, 24 A. The answer is (D).\n\n\ - Q: A pipe full of air is closed at one end. A standing wave is produced in the pipe,\ - \ causing the pipe to sound a note. Which of the following is a correct statement\ - \ about the wave\u2019s properties at the closed end of the pipe?\n(A) The pressure\ - \ is at a node, but the particle displacement is at an antinode. (B) The pressure\ - \ is at an antinode, but the particle displacement is at a node. (C) The pressure\ - \ and the particle displacement are both at nodes. (D) The pressure and the particle\ - \ displacement are both at antinodes.\nA: Let's think step by step. At the closed\ - \ end of the pipe, the particles cannot have any net displacement because the pipe\ - \ closure stops them. So the particle displacement is at a node. This closure also\ - \ causes the pressure to be maximal, i.e. an antinode. The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_physics + \ How much work is required to move a charge, Q = +8 µC, from the point (0, 4 meters)\ + \ to the point (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J\nA: Let's\ + \ think step by step. To calculate the work required to move a charge from one location\ + \ to another in a fixed electric field, it is enough to calculate the potential\ + \ difference between the two locations. Here, the potential only depends on the\ + \ distance between the charges; it’s $k q_1 q_2 / r$, where $k$ is Coulomb’s constant.\ + \ Plugging in values $q_1 = $ 1 mC, $q_2 = 8 \\mu$ C, gives the answer as 5.992\ + \ J, which rounds to 6 J. The answer is (B).\n\nQ: Which of the following conditions\ + \ will ensure that angular momentum is conserved? I. Conservation of linear momentum\ + \ II. Zero net external force III. 
Zero net external torque\n(A) I and II only (B)\ + \ I and III only (C) II and III only (D) III only\nA: Let's think step by step.\ + \ Torque is defined as the change in angular momentum; if there is zero external\ + \ torque, angular momentum is conserved. The answer is (D).\n\nQ: A photocell of\ + \ work function ϕ = 2eV is connected to a resistor in series. Light of frequency\ + \ f = 1 × 10^15 Hz hits a metal plate of the photocell. If the power of the light\ + \ is P = 100 W, what is the current through the resistor?\n(A) 2:00 AM (B) 6:00\ + \ AM (C) 12:00 AM (D) 24 A\nA: Let's think step by step. The only answer above which\ + \ has units of current is D, 24 A. The answer is (D).\n\nQ: A pipe full of air is\ + \ closed at one end. A standing wave is produced in the pipe, causing the pipe to\ + \ sound a note. Which of the following is a correct statement about the wave’s properties\ + \ at the closed end of the pipe?\n(A) The pressure is at a node, but the particle\ + \ displacement is at an antinode. (B) The pressure is at an antinode, but the particle\ + \ displacement is at a node. (C) The pressure and the particle displacement are\ + \ both at nodes. (D) The pressure and the particle displacement are both at antinodes.\n\ + A: Let's think step by step. At the closed end of the pipe, the particles cannot\ + \ have any net displacement because the pipe closure stops them. So the particle\ + \ displacement is at a node. This closure also causes the pressure to be maximal,\ + \ i.e. an antinode. The answer is (B)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml index 706db0ecbbd0cb4bfefa6fc8c0b780661f9b879a..b54a6c384a53a119de1e00a5191cc2d686fe558e 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml @@ -1,72 +1,48 @@ -dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high - school psychology. - - - Q: Pascale is interested in the processing strategies children use to learn new - information. Pascale would best be classified as what type of psychologist? - - (A) sociocultural (B) clinical (C) cognitive (D) behaviorist - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - Sociocultural psychologist focuses on the effect of societal factors on people. - Clinical psychologist focuses on people with mental issues. Cognitive psychologist - focuses on how people think and learn, including the processing strategies. Behaviorist - focuses more on the environment and experience effect on people. The answer is (C). - - - Q: According to Caplan''s model of consultee-centered case consultation, the consultant - is primarily interested in - - (A) identifying the causes and solutions of the client''s presenting problems (B) - identifying and eliminating the causes of the consultee''s difficulties in handling - a problem (C) establishing a hierarchy of authority to enable effective decision - making (D) presenting a single, well-defined and unambiguous course of action for - the consultant to overcome skills deficits - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - Caplan defines two type of consultation. 
Client-centered case consultation aims - to handle client''s problems, while consultee-centered case consultation aims to - identify the reason of client''s difficulty to solve problems. The answer is (B). - - - Q: According to the Individuals with Disabilities Education Improvement Act, which - of the following must an educational agency do before it changes the educational - placement of a student with a disability? - - (A) Give the child a trial period in the new environment (B) Notify the parents - in writing (C) Obtain school board approval (D) Obtain parental consent - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - When the decision to change the educational placement of a student with a disability - is made, the educational agency must notify the parents in writing on that date. - The answer is (B). - - - Q: While swimming in the ocean, Ivan is frightened by a dark shadow in the water - even before he has the chance to identify what the shadow is. The synaptic connections - taking place during this incident of fright are best described by which of the following? - - (A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are - sent from the thalamus to the "what" and "where" pathways. (C) Messages are sent - from the parasympathetic nervous system to the cerebral cortex. (D) Messages are - sent from the frontal lobes to the pituitary gland. - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - Our neural system has a mechanism that can respond immediate emotional signal before - going to the thought center. In the Ivan''s case, messages travel directly from - thalamus to amygdala. The answer is (A). - - - Q: Ani believes that her attitudes and behavior play a central role in what happens - to her. Such a belief is likely to be associated with - - (A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an internal - locus of control. - - A: Let''s think step by step. We refer to Wikipedia articles on psychology for help. - People with an external locus of control believes fate and luck play an important - role in their lives, while people with an internal locus of control believes they - control their lives. The answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_psychology +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\nQ: Pascale is interested in the processing strategies children\ + \ use to learn new information. Pascale would best be classified as what type of\ + \ psychologist?\n(A) sociocultural (B) clinical (C) cognitive (D) behaviorist\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ Sociocultural psychologist focuses on the effect of societal factors on people.\ + \ Clinical psychologist focuses on people with mental issues. Cognitive psychologist\ + \ focuses on how people think and learn, including the processing strategies. Behaviorist\ + \ focuses more on the environment and experience effect on people. 
The answer is\ + \ (C).\n\nQ: According to Caplan's model of consultee-centered case consultation,\ + \ the consultant is primarily interested in\n(A) identifying the causes and solutions\ + \ of the client's presenting problems (B) identifying and eliminating the causes\ + \ of the consultee's difficulties in handling a problem (C) establishing a hierarchy\ + \ of authority to enable effective decision making (D) presenting a single, well-defined\ + \ and unambiguous course of action for the consultant to overcome skills deficits\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ Caplan defines two type of consultation. Client-centered case consultation aims\ + \ to handle client's problems, while consultee-centered case consultation aims to\ + \ identify the reason of client's difficulty to solve problems. The answer is (B).\n\ + \nQ: According to the Individuals with Disabilities Education Improvement Act, which\ + \ of the following must an educational agency do before it changes the educational\ + \ placement of a student with a disability?\n(A) Give the child a trial period in\ + \ the new environment (B) Notify the parents in writing (C) Obtain school board\ + \ approval (D) Obtain parental consent\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on psychology for help. When the decision to change the educational\ + \ placement of a student with a disability is made, the educational agency must\ + \ notify the parents in writing on that date. The answer is (B).\n\nQ: While swimming\ + \ in the ocean, Ivan is frightened by a dark shadow in the water even before he\ + \ has the chance to identify what the shadow is. The synaptic connections taking\ + \ place during this incident of fright are best described by which of the following?\n\ + (A) Messages are sent from the thalamus directly to the amygdala. (B) Messages are\ + \ sent from the thalamus to the \"what\" and \"where\" pathways. (C) Messages are\ + \ sent from the parasympathetic nervous system to the cerebral cortex. (D) Messages\ + \ are sent from the frontal lobes to the pituitary gland.\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on psychology for help. Our neural system\ + \ has a mechanism that can respond immediate emotional signal before going to the\ + \ thought center. In the Ivan's case, messages travel directly from thalamus to\ + \ amygdala. The answer is (A).\n\nQ: Ani believes that her attitudes and behavior\ + \ play a central role in what happens to her. Such a belief is likely to be associated\ + \ with\n(A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an\ + \ internal locus of control.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on psychology for help. People with an external locus of control believes\ + \ fate and luck play an important role in their lives, while people with an internal\ + \ locus of control believes they control their lives. The answer is (D)." 
+"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml index 37e210612a74a8787a95894717d4db280573645e..918f6ac3975b8463393d995b129a63445ba89992 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml @@ -1,88 +1,58 @@ -dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high - school statistics. - - - Q: A new smartwatch is manufactured in one part of a factory, then secured for shipping - in another, independent part of the factory. The weight of the smartwatch has a - mean of 62 grams and a standard deviation of 1.0 grams. The weight of the packaging - (box, user''s guide, bubble wrap, etc.) has a mean of 456 grams and a standard deviation - of 6 grams. Together, the distribution of the weight of the smartwatch and its packaging - would have the following mean and standard deviation: - - (A) Mean 518 grams; standard deviation 7.0 grams (B) Mean 518 grams; standard deviation - 3.5 grams (C) Mean 518 grams; standard deviation 6.1 grams (D) Mean 394 grams; standard - deviation 6.1 grams - - A: Let''s think step by step. Since the weight of the watch and the weight of the - packaging are independent random variables, the mean and variance of their sum is - equal to the sum of their individual means and variances. So the mean is 62 + 456 - = 518 grams, and the variances is 1.0^2 + 6.0^2 = 37, leading to a standard deviation - of 6.1 grams. The answer is (C). - - - Q: After a frost warning was issued, the owner of a large orange grove asked his - workers to spray all his trees with water. The water was supposed to freeze and - form a protective covering of ice around the orange blossom. Nevertheless, the owner - suspected that some trees suffered considerable damage due to the frost. To estimate - the proportion of trees that suffered more than 50 percent damage due to the frost, - he took a random sample of 100 trees from his grove. What is the response variable - in this experiment? - - (A) The proportion of trees that suffered more than 50 percent damage due to frost. - (B) The number of trees affected by the frost. (C) The number of trees sampled from - the grove. (D) For each sampled tree, whether it suffered more than 50 percent damage - or at most 50 percent damage. - - A: Let''s think step by step. In this experiment, the response variable is what - is measured. For each tree, what is measured is whether or not it suffered more - than 50 percent damage due to the frost. The answer is (D). - - - Q: Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and - var(Y) = 12. What are the expected value and variance of the random variable X + - Y? - - (A) E(X + Y) = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X + - Y) = 99, var(X + Y) = 17 (D) There is insufficient information to answer this question. - - A: Let''s think step by step. While means of sums of random variables add (regardless - of whether the variables are independent) in order to determine the variance of - a sum of random variables, we need to know not just their individual variances but - the covariance of the two variables, which is not given in this problem. 
The answer - is (D). - - - Q: Which of the following sets has the smallest standard deviation? Which has the - largest? - - I: {1,2,3} - - II: {-10,10} - - III: {100} - - (A) I, II (B) II, III (C) III, I (D) III, II - - A: Let''s think step by step. The variance of distribution I is the expected squared - deviation from its mean (which is 2), so the variance is 2/3 . The variance of distribution - II is 10^2 (because both elements are 10 away from the mean of zero). The variance - of distribution III is 0, since it has a single entry. So distribution III has the - smallest standard deviation and distribution II has the largest. The answer is (D). - - - Q: Which of the following is a correct statement about correlation? - - (A) If the slope of the regression line is exactly 1, then the correlation is exactly - 1. (B) If the correlation is 0, then the slope of the regression line is undefined. - (C) Switching which variable is called x and which is called y changes the sign - of the correlation. (D) The correlation r is equal to the slope of the regression - line when z-scores for the y-variable are plotted against z-scores for the x-variable. - - A: Let''s think step by step. Statement A is false because the slope of the regression - line being exactly 1 can occur even when the two variables are not perfectly correlated. - Statement B is false because uncorrelated variables regression lines can have slope - zero. Statement C is false because correlation is symmetric in the two random variables. - The answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_statistics +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\nQ: A new smartwatch is manufactured in one part of a factory,\ + \ then secured for shipping in another, independent part of the factory. The weight\ + \ of the smartwatch has a mean of 62 grams and a standard deviation of 1.0 grams.\ + \ The weight of the packaging (box, user's guide, bubble wrap, etc.) has a mean\ + \ of 456 grams and a standard deviation of 6 grams. Together, the distribution of\ + \ the weight of the smartwatch and its packaging would have the following mean and\ + \ standard deviation:\n(A) Mean 518 grams; standard deviation 7.0 grams (B) Mean\ + \ 518 grams; standard deviation 3.5 grams (C) Mean 518 grams; standard deviation\ + \ 6.1 grams (D) Mean 394 grams; standard deviation 6.1 grams\nA: Let's think step\ + \ by step. Since the weight of the watch and the weight of the packaging are independent\ + \ random variables, the mean and variance of their sum is equal to the sum of their\ + \ individual means and variances. So the mean is 62 + 456 = 518 grams, and the variances\ + \ is 1.0^2 + 6.0^2 = 37, leading to a standard deviation of 6.1 grams. The answer\ + \ is (C).\n\nQ: After a frost warning was issued, the owner of a large orange grove\ + \ asked his workers to spray all his trees with water. The water was supposed to\ + \ freeze and form a protective covering of ice around the orange blossom. Nevertheless,\ + \ the owner suspected that some trees suffered considerable damage due to the frost.\ + \ To estimate the proportion of trees that suffered more than 50 percent damage\ + \ due to the frost, he took a random sample of 100 trees from his grove. What is\ + \ the response variable in this experiment?\n(A) The proportion of trees that suffered\ + \ more than 50 percent damage due to frost. 
(B) The number of trees affected by\ + \ the frost. (C) The number of trees sampled from the grove. (D) For each sampled\ + \ tree, whether it suffered more than 50 percent damage or at most 50 percent damage.\n\ + A: Let's think step by step. In this experiment, the response variable is what is\ + \ measured. For each tree, what is measured is whether or not it suffered more than\ + \ 50 percent damage due to the frost. The answer is (D).\n\nQ: Suppose X and Y are\ + \ random variables with E(X) = 37, var(X) = 5, E(Y) = 62, and var(Y) = 12. What\ + \ are the expected value and variance of the random variable X + Y?\n(A) E(X + Y)\ + \ = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X + Y) = 99, var(X\ + \ + Y) = 17 (D) There is insufficient information to answer this question.\nA: Let's\ + \ think step by step. While means of sums of random variables add (regardless of\ + \ whether the variables are independent) in order to determine the variance of a\ + \ sum of random variables, we need to know not just their individual variances but\ + \ the covariance of the two variables, which is not given in this problem. The answer\ + \ is (D).\n\nQ: Which of the following sets has the smallest standard deviation?\ + \ Which has the largest?\nI: {1,2,3}\nII: {-10,10}\nIII: {100}\n(A) I, II (B) II,\ + \ III (C) III, I (D) III, II\nA: Let's think step by step. The variance of distribution\ + \ I is the expected squared deviation from its mean (which is 2), so the variance\ + \ is 2/3 . The variance of distribution II is 10^2 (because both elements are 10\ + \ away from the mean of zero). The variance of distribution III is 0, since it has\ + \ a single entry. So distribution III has the smallest standard deviation and distribution\ + \ II has the largest. The answer is (D).\n\nQ: Which of the following is a correct\ + \ statement about correlation?\n(A) If the slope of the regression line is exactly\ + \ 1, then the correlation is exactly 1. (B) If the correlation is 0, then the slope\ + \ of the regression line is undefined. (C) Switching which variable is called x\ + \ and which is called y changes the sign of the correlation. (D) The correlation\ + \ r is equal to the slope of the regression line when z-scores for the y-variable\ + \ are plotted against z-scores for the x-variable.\nA: Let's think step by step.\ + \ Statement A is false because the slope of the regression line being exactly 1\ + \ can occur even when the two variables are not perfectly correlated. Statement\ + \ B is false because uncorrelated variables regression lines can have slope zero.\ + \ Statement C is false because correlation is symmetric in the two random variables.\ + \ The answer is (D)." 
+"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml index 951666d10d6874ca3f2b4b221fc7e0b10a35fdfa..d8d0bfbbe058cdf97f5e07ce5eec539f33d38d03 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_us_history -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ \ school us history.\n\nQ: This question refers to the following information.\n\ I come not to urge personal claims, nor to seek individual benefits; I appear as\ \ the advocate of those who cannot plead their own cause; I come as the friend of\ @@ -8,126 +8,127 @@ description: "The following are multiple choice questions (with answers) about h \ jails penetrate not your Halls of Legislation. I am the Hope of the poor crazed\ \ beings who pine in the cells, and stalls, and cages, and waste rooms of your poor-houses.\ \ I am the Revelation of hundreds of wailing, suffering creatures, hidden in your\ - \ private dwellings, and in pens and cabins\u2014shut out, cut off from all healing\ - \ influences, from all mind-restoring cares.\u2026 Could their melancholy histories\ - \ be spread before you as revealed to my grieved spirit during the last three months,\ - \ how promptly, how earnestly would you search out the most approved means of relief;\ - \ how trifling, how insignificant, by comparison, would appear the sacrifices you\ - \ are asked to make; how would a few dimes and dollars, gathered from each citizen,\ - \ diminish in value as a possession, compared with the certain benefits and vast\ - \ good to be secured for the suffering insane...by the consecration and application\ - \ of a sufficient fund to the construction of a suitable hospital.\u2026\n\u2014\ - Dorothea Dix, Memorial Soliciting a State Hospital for the Protection and Cure of\ - \ the Insane,\nSubmitted to the General Assembly of North Carolina, November 1848\n\ - Dorothea Dix can best be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C)\ - \ Shirley Temple (D) Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on us history for help. Both Dorothea Dix and Clara barton are American\ - \ nurses. The answer is (B).\n\nQ: This question refers to the following information.\n\ - \"As our late Conduct at the Conestoga Manor and Lancaster have occasioned much\ - \ Speculation & a great diversity of Sentiments in this and neighboring Governments;\ - \ some vindicating & others condemning it; some charitably alleviating the Crime,\ - \ & others maliciously painting it in the most odious & detestable Colours, we think\ - \ it our duty to lay before the Publick, the whole Matter as it appeared, & still\ - \ appears, to us. . . .\n\"If these things are not sufficient to prove an unjustifiable\ - \ Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend\ - \ them & an utter insensibility to human Distresses, let us consider a few more\ - \ recent Facts. 
When we found the last Summer that we were likely to get no Assistance\ - \ from the Government, some Volunteers went out at our own Expense, determined to\ - \ drive our Enemies from our Borders; & when we came near to the great Island, we\ - \ understood that a Number of their Warriors had gone out against our Frontiers.\ - \ Upon this we returned and came up with them and fought with them at the Munfey\ - \ Hill where we lost some of our Men & killed some of their Warriors & thereby saved\ - \ our Frontiers from this Story in another Expedition. But no sooner had we destroyed\ - \ their Provisions on the great Island, & ruined their trade with the good People\ - \ at Bethlehem, but these very Indians, who were justly suspected of having murdered\ - \ our Friends in Northampton County, were by the Influence of some Quakers taken\ - \ under the Protection of the Government to screen them from the Resentments of\ - \ the Friends and Relations of the Murdered, & to support them thro the Winter.\"\ - \n\u2014\"Apology of the Paxton Boys\" (pamphlet), 1764 (Note: \"apology\" in this\ - \ context should be read as an explanation, not an admission of guilt or regret.\n\ - The sentiments expressed in the explanation above reflect which of the ongoing tensions\ - \ during the colonial period of American history?\n(A) Tensions between British\ - \ policies and the aspirations of North American colonists. (B) Tensions between\ - \ American Indians allied with the French and those allied with the British. (C)\ - \ Tensions between freed African Americans and white planters. (D) Tensions between\ - \ backcountry settlers and elites within colonial America.\nA: Let's think step\ - \ by step. We refer to Wikipedia articles on us history for help. After the French\ - \ and Indian War, the Scotch-Irish settlers attacked American Indians. After the\ - \ attacks on the Conestoga, about 250 Paxton Boys present their grievances to the\ - \ Pennsylvania legislature. As mentioned in the information, the Paxton Boys cited\ - \ resentiment at local elites. The answer is (D).\n\nQ: This question refers to\ - \ the following information.\nOur leaders talk about stopping aggression from the\ - \ north, but this was a struggle among groups of Vietnamese until we intervened.\ - \ We seem bent upon saving the Vietnamese from Ho Chi Minh even if we have to kill\ - \ them and demolish their country to do it. As the native people survey bombed-out\ - \ villages, women and children burned by napalm, rice crops destroyed and cities\ - \ overrun with our military personnel, they are doubtless saying secretly of the\ - \ Vietcong guerillas and of the American forces, \"A plague on both your houses.\"\ - \ \u2026 Stop the bombing, north and south, end search and destroy offensive sweeps,\ - \ and confine our military action to holding operations on the ground. Bombing the\ - \ north has failed to halt or seriously check the flow of troops to the south and\ - \ may, in fact, have prompted a much greater war effort by Hanoi.\n\u2014Senator\ - \ George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\nWhich of the following\ - \ opinions from the 1960s most directly reflects the perspective of George McGovern's\ - \ speech?\n(A) Americans must maximize their technological edge in Vietnam. (B)\ - \ American bombing in Vietnam is step by step leading to progress in the war. (C)\ - \ American bombing in Vietnam is a failure. (D) America must not give in to defeatism\ - \ about the war in Vietnam.\nA: Let's think step by step. 
We refer to Wikipedia\ - \ articles on us history for help. \"Stop the bombing\" and \"Bombing the north\ - \ has failed to halt or seriously check the flow of troops to the south\" indicate\ - \ that the perspective of George McGovern's speech is that Amerian bombing in Vietnam\ - \ is a failure. The answer is (C).\n\nQ: This question refers to the following information.\n\ - \"In the new Code of Laws which I suppose it will be necessary for you to make I\ - \ desire you would Remember the Ladies, and be more generous and favorable to them\ - \ than your ancestors. Do not put such unlimited power into the hands of the Husbands.\ - \ Remember all Men would be tyrants if they could. If particular care and attention\ - \ is not paid to the Ladies we are determined to foment a Rebellion, and will not\ - \ hold ourselves bound by any Laws in which we have no voice, or Representation.\"\ - \nAbigail Adams, in a letter to John Adams, 1776\n\"Special legislation for woman\ - \ has placed us in a most anomalous position. Women invested with the rights of\ - \ citizens in one section\u2014voters, jurors, office-holders\u2014crossing an imaginary\ - \ line, are subjects in the next. In some States, a married woman may hold property\ - \ and transact business in her own name; in others, her earnings belong to her husband.\ - \ In some States, a woman may testify against her husband, sue and be sued in the\ - \ courts; in others, she has no redress in case of damage to person, property, or\ - \ character. In case of divorce on account of adultery in the husband, the innocent\ - \ wife is held to possess no right to children or property, unless by special decree\ - \ of the court. But in no State of the Union has the wife the right to her own person,\ - \ or to any part of the joint earnings of the co-partnership during the life of\ - \ her husband. In some States women may enter the law schools and practice in the\ - \ courts; in others they are forbidden. In some universities girls enjoy equal educational\ - \ advantages with boys, while many of the proudest institutions in the land deny\ - \ them admittance, though the sons of China, Japan and Africa are welcomed there.\ - \ But the privileges already granted in the several States are by no means secure.\"\ - \nSusan B. Anthony, \"Declaration of Rights for Women,\" July 4, 1876\nThe sentiments\ - \ expressed in the second excerpt by Susan B. Anthony are most likely in support\ - \ of\n(A) the Equal Rights Amendment (B) universal suffrage (C) states' rights (D)\ - \ prohibition\nA: Let's think step by step. We refer to Wikipedia articles on us\ - \ history for help. The above information mentioned that women are in an anomalous\ - \ position in terms of legislation. Women's earnings do not belong to themselves,\ - \ or they cannot testify against her husbands. Susan believes women should have\ - \ equal legal rights as men. The answer is (B).\n\nQ: This question refers to the\ - \ following information.\n\"Society in every state is a blessing, but government\ - \ even in its best state is but a necessary evil; in its worst state an intolerable\ - \ one; for when we suffer, or are exposed to the same miseries by a government,\ - \ which we might expect in a country without government, our calamity is heightened\ - \ by reflecting that we furnish the means by which we suffer. Government, like dress,\ - \ is the badge of lost innocence; the palaces of kings are built on the ruins of\ - \ the bowers of paradise. 
For were the impulses of conscience clear, uniform, and\ - \ irresistibly obeyed, man would need no other lawgiver; but that not being the\ - \ case, he finds it necessary to surrender up a part of his property to furnish\ - \ means for the protection of the rest; and this he is induced to do by the same\ - \ prudence which in every other case advises him out of two evils to choose the\ - \ least. Wherefore, security being the true design and end of government, it unanswerably\ - \ follows that whatever form thereof appears most likely to ensure it to us, with\ - \ the least expense and greatest benefit, is preferable to all others.\"\nThomas\ - \ Paine, Common Sense, 1776\nWhich of the following \"miseries\" alluded to above\ - \ were most condemned by Anti-Federalists of the post-Revolutionary era?\n(A) Organized\ - \ response to Bacon's Rebellion (B) Federal response to Shays's Rebellion (C) Federal\ - \ response to the Whiskey Rebellion (D) Federal response to Pontiac's Rebellion\n\ - A: Let's think step by step. We refer to Wikipedia articles on us history for help.\ - \ Anti-Federalists do not believe centralized government power, and suspect Washington's\ - \ military response to Whiskey Rebellion. Bacon's Rebellion and Pontiac's Rebellion\ - \ happen before the Revolution and they can be ruled out. The answer is (C)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_us_history + \ private dwellings, and in pens and cabins—shut out, cut off from all healing influences,\ + \ from all mind-restoring cares.… Could their melancholy histories be spread before\ + \ you as revealed to my grieved spirit during the last three months, how promptly,\ + \ how earnestly would you search out the most approved means of relief; how trifling,\ + \ how insignificant, by comparison, would appear the sacrifices you are asked to\ + \ make; how would a few dimes and dollars, gathered from each citizen, diminish\ + \ in value as a possession, compared with the certain benefits and vast good to\ + \ be secured for the suffering insane...by the consecration and application of a\ + \ sufficient fund to the construction of a suitable hospital.…\n—Dorothea Dix, Memorial\ + \ Soliciting a State Hospital for the Protection and Cure of the Insane,\nSubmitted\ + \ to the General Assembly of North Carolina, November 1848\nDorothea Dix can best\ + \ be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C) Shirley Temple (D)\ + \ Hillary Clinton\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ us history for help. Both Dorothea Dix and Clara barton are American nurses. The\ + \ answer is (B).\n\nQ: This question refers to the following information.\n\"As\ + \ our late Conduct at the Conestoga Manor and Lancaster have occasioned much Speculation\ + \ & a great diversity of Sentiments in this and neighboring Governments; some vindicating\ + \ & others condemning it; some charitably alleviating the Crime, & others maliciously\ + \ painting it in the most odious & detestable Colours, we think it our duty to lay\ + \ before the Publick, the whole Matter as it appeared, & still appears, to us. .\ + \ . 
.\n\"If these things are not sufficient to prove an unjustifiable Attachment\ + \ in the Quakers to the Indians Savages, a fixed Resolution to befriend them & an\ + \ utter insensibility to human Distresses, let us consider a few more recent Facts.\ + \ When we found the last Summer that we were likely to get no Assistance from the\ + \ Government, some Volunteers went out at our own Expense, determined to drive our\ + \ Enemies from our Borders; & when we came near to the great Island, we understood\ + \ that a Number of their Warriors had gone out against our Frontiers. Upon this\ + \ we returned and came up with them and fought with them at the Munfey Hill where\ + \ we lost some of our Men & killed some of their Warriors & thereby saved our Frontiers\ + \ from this Story in another Expedition. But no sooner had we destroyed their Provisions\ + \ on the great Island, & ruined their trade with the good People at Bethlehem, but\ + \ these very Indians, who were justly suspected of having murdered our Friends in\ + \ Northampton County, were by the Influence of some Quakers taken under the Protection\ + \ of the Government to screen them from the Resentments of the Friends and Relations\ + \ of the Murdered, & to support them thro the Winter.\"\n—\"Apology of the Paxton\ + \ Boys\" (pamphlet), 1764 (Note: \"apology\" in this context should be read as an\ + \ explanation, not an admission of guilt or regret.\nThe sentiments expressed in\ + \ the explanation above reflect which of the ongoing tensions during the colonial\ + \ period of American history?\n(A) Tensions between British policies and the aspirations\ + \ of North American colonists. (B) Tensions between American Indians allied with\ + \ the French and those allied with the British. (C) Tensions between freed African\ + \ Americans and white planters. (D) Tensions between backcountry settlers and elites\ + \ within colonial America.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on us history for help. After the French and Indian War, the Scotch-Irish settlers\ + \ attacked American Indians. After the attacks on the Conestoga, about 250 Paxton\ + \ Boys present their grievances to the Pennsylvania legislature. As mentioned in\ + \ the information, the Paxton Boys cited resentiment at local elites. The answer\ + \ is (D).\n\nQ: This question refers to the following information.\nOur leaders\ + \ talk about stopping aggression from the north, but this was a struggle among groups\ + \ of Vietnamese until we intervened. We seem bent upon saving the Vietnamese from\ + \ Ho Chi Minh even if we have to kill them and demolish their country to do it.\ + \ As the native people survey bombed-out villages, women and children burned by\ + \ napalm, rice crops destroyed and cities overrun with our military personnel, they\ + \ are doubtless saying secretly of the Vietcong guerillas and of the American forces,\ + \ \"A plague on both your houses.\" … Stop the bombing, north and south, end search\ + \ and destroy offensive sweeps, and confine our military action to holding operations\ + \ on the ground. Bombing the north has failed to halt or seriously check the flow\ + \ of troops to the south and may, in fact, have prompted a much greater war effort\ + \ by Hanoi.\n—Senator George McGovern, \"The Lessons of Vietnam,\" April 25, 1967\n\ + Which of the following opinions from the 1960s most directly reflects the perspective\ + \ of George McGovern's speech?\n(A) Americans must maximize their technological\ + \ edge in Vietnam. 
(B) American bombing in Vietnam is step by step leading to progress\
+ \ in the war. (C) American bombing in Vietnam is a failure. (D) America must not\
+ \ give in to defeatism about the war in Vietnam.\nA: Let's think step by step. We\
+ \ refer to Wikipedia articles on us history for help. \"Stop the bombing\" and \"\
+ Bombing the north has failed to halt or seriously check the flow of troops to the\
+ \ south\" indicate that the perspective of George McGovern's speech is that American\
+ \ bombing in Vietnam is a failure. The answer is (C).\n\nQ: This question refers\
+ \ to the following information.\n\"In the new Code of Laws which I suppose it will\
+ \ be necessary for you to make I desire you would Remember the Ladies, and be more\
+ \ generous and favorable to them than your ancestors. Do not put such unlimited\
+ \ power into the hands of the Husbands. Remember all Men would be tyrants if they\
+ \ could. If particular care and attention is not paid to the Ladies we are determined\
+ \ to foment a Rebellion, and will not hold ourselves bound by any Laws in which\
+ \ we have no voice, or Representation.\"\nAbigail Adams, in a letter to John Adams,\
+ \ 1776\n\"Special legislation for woman has placed us in a most anomalous position.\
+ \ Women invested with the rights of citizens in one section—voters, jurors, office-holders—crossing\
+ \ an imaginary line, are subjects in the next. In some States, a married woman may\
+ \ hold property and transact business in her own name; in others, her earnings belong\
+ \ to her husband. In some States, a woman may testify against her husband, sue and\
+ \ be sued in the courts; in others, she has no redress in case of damage to person,\
+ \ property, or character. In case of divorce on account of adultery in the husband,\
+ \ the innocent wife is held to possess no right to children or property, unless\
+ \ by special decree of the court. But in no State of the Union has the wife the\
+ \ right to her own person, or to any part of the joint earnings of the co-partnership\
+ \ during the life of her husband. In some States women may enter the law schools\
+ \ and practice in the courts; in others they are forbidden. In some universities\
+ \ girls enjoy equal educational advantages with boys, while many of the proudest\
+ \ institutions in the land deny them admittance, though the sons of China, Japan\
+ \ and Africa are welcomed there. But the privileges already granted in the several\
+ \ States are by no means secure.\"\nSusan B. Anthony, \"Declaration of Rights for\
+ \ Women,\" July 4, 1876\nThe sentiments expressed in the second excerpt by Susan\
+ \ B. Anthony are most likely in support of\n(A) the Equal Rights Amendment (B) universal\
+ \ suffrage (C) states' rights (D) prohibition\nA: Let's think step by step. We refer\
+ \ to Wikipedia articles on us history for help. The above information mentioned\
+ \ that women are in an anomalous position in terms of legislation. Women's earnings\
+ \ do not belong to themselves, or they cannot testify against her husbands. Susan\
+ \ believes women should have equal legal rights as men. 
The answer is (B).\n\nQ:\ + \ This question refers to the following information.\n\"Society in every state is\ + \ a blessing, but government even in its best state is but a necessary evil; in\ + \ its worst state an intolerable one; for when we suffer, or are exposed to the\ + \ same miseries by a government, which we might expect in a country without government,\ + \ our calamity is heightened by reflecting that we furnish the means by which we\ + \ suffer. Government, like dress, is the badge of lost innocence; the palaces of\ + \ kings are built on the ruins of the bowers of paradise. For were the impulses\ + \ of conscience clear, uniform, and irresistibly obeyed, man would need no other\ + \ lawgiver; but that not being the case, he finds it necessary to surrender up a\ + \ part of his property to furnish means for the protection of the rest; and this\ + \ he is induced to do by the same prudence which in every other case advises him\ + \ out of two evils to choose the least. Wherefore, security being the true design\ + \ and end of government, it unanswerably follows that whatever form thereof appears\ + \ most likely to ensure it to us, with the least expense and greatest benefit, is\ + \ preferable to all others.\"\nThomas Paine, Common Sense, 1776\nWhich of the following\ + \ \"miseries\" alluded to above were most condemned by Anti-Federalists of the post-Revolutionary\ + \ era?\n(A) Organized response to Bacon's Rebellion (B) Federal response to Shays's\ + \ Rebellion (C) Federal response to the Whiskey Rebellion (D) Federal response to\ + \ Pontiac's Rebellion\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on us history for help. Anti-Federalists do not believe centralized government\ + \ power, and suspect Washington's military response to Whiskey Rebellion. Bacon's\ + \ Rebellion and Pontiac's Rebellion happen before the Revolution and they can be\ + \ ruled out. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml index 1cf4bbdb9e95c03b7550894a31eb209c89836e7c..6db82ea64dada9c748c4a221a67e2c37092ab2a4 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml @@ -1,5 +1,5 @@ -dataset_name: high_school_world_history -description: "The following are multiple choice questions (with answers) about high\ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high\ \ school world history.\n\nQ: This question refers to the following information.\n\ \"At least one of the [world's] societies would have to somehow enormously increase\ \ its productivity [in order to achieve global hegemony]. That quantum jump would\ @@ -8,75 +8,75 @@ description: "The following are multiple choice questions (with answers) about h \ be accomplished by exploiting the ecosystems, mineral resources, and human assets\ \ of whole continents outside the lands of the society making the jump. 
Western\ \ Europe did just that by means of its brutality and guns and, more important, by\ - \ geographical and ecological luck.\"\nCopyright \xA9 2015 Cambridge University\ - \ Press.\nAlfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum\ - \ jump\" mentioned in the passage most directly contributed to which of the following\ - \ developments in the period 1450\u20131750 C.E.?\n(A) A breakdown in trade routes\ - \ through the collapse of the established state structure (B) An increase in the\ - \ population of the world through more plentiful supplies of food (C) The spread\ - \ of Chinese and Indian belief systems across the world (D) An increase in social\ - \ unrest\nA: Let's think step by step. We refer to Wikipedia articles on world history\ - \ for help. The \"quantum jump\" mentioned in the passage refers to the conquest\ - \ of the New World and the Columbian Exchange. Choice (A) and (C) did not happen\ - \ in history. Choice (C) refers to the human assets. The answer is (B).\n\nQ: This\ - \ question refers to the following information.\n\"The struggle against neo-colonialism\ - \ is not aimed at excluding the capital of the developed world from operating in\ - \ less developed countries. It is aimed at preventing the financial power of the\ - \ developed countries being used in such a way as to impoverish the less developed.\n\ - Non-alignment, as practiced by Ghana and many other countries, is based on co-operation\ - \ with all States whether they be capitalist, socialist or have a mixed economy.\ - \ Such a policy, therefore, involves foreign investment from capitalist countries,\ - \ but it must be invested in accordance with a national plan drawn up by the government\ - \ of the non-aligned State with its own interests in mind. The issue is not what\ - \ return the foreign investor receives on his investments\u2026The question is one\ - \ of power. A State in the grip of neo-colonialism is not master of its own destiny.\"\ - \nKwame Nkrumah, Neo-Colonialism, 1965\nWhich of the following provides the best\ - \ context for Nkrumah's writings?\n(A) The Industrial Revolution (B) Decolonization\ - \ (C) Regional Free Trade Associations (D) Autarky\nA: Let's think step by step.\ - \ We refer to Wikipedia articles on world history for help. The passage expresses\ - \ a point that the successful fight against neo-colonialism were in danger and the\ - \ newly independent nations like Ghana may be re-colonized via financial power of\ - \ the developed countries. The answer is (B).\n\nQ: This question refers to the\ - \ following information.\n\"Indeed, as both the fatwas of distinguished [scholars]\ - \ who base their opinion on reason and tradition alike and the consensus of the\ - \ Sunni community agree that the ancient obligation of extirpation, extermination,\ - \ and expulsion of evil innovation must be the aim of our exalted aspiration, for\ - \ \"Religious zeal is a victory for the Faith of God the Beneficent\"; then, in\ - \ accordance with the words of the Prophet (Peace upon him!) 
\"Whosoever introduces\ - \ evil innovation into our order must be expelled\" and \"Whosoever does aught against\ - \ our order must be expelled,\" action has become necessary and exigent\u2026\"\n\ - Letter from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514\nThe letter from\ - \ Selim I is most clearly an example of which of the following?\n(A) The maintenance\ - \ of military supremacy at all costs (B) Expanding tensions between religious sects\ - \ (C) Factors that brought about the collapse of the Ottoman Empire (D) Peacemaking\ - \ efforts among the Islamic empires\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on world history for help. The passage is an example of expanding tensions\ - \ between Selim and Ismail. In the passage the Selim references the fatwa and the\ - \ consensus of the Sunni community to against whosoever introduces evil. The answer\ - \ is (B).\n\nQ: This question refers to the following information.\n\"The real grievance\ - \ of the worker is the insecurity of his existence; he is not sure that he will\ - \ always have work, he is not sure that he will always be healthy, and he foresees\ - \ that he will one day be old and unfit to work. If he falls into poverty, even\ - \ if only through a prolonged illness, he is then completely helpless, exam_ins\ - \ to his own devices, and society does not currently recognize any real obligation\ - \ towards him beyond the usual help for the poor, even if he has been working all\ - \ the time ever so faithfully and diligently. The usual help for the poor, however,\ - \ leaves a lot to be desired, especially in large cities, where it is very much\ - \ worse than in the country.\"\nOtto von Bismarck, 1884\nOtto von Bismarck likely\ - \ made this speech in reaction to which of the following issues?\n(A) Social acceptance\ - \ of child labor (B) Declining life expectancy in Germany (C) Criticisms of German\ - \ trade tariffs (D) Negative effects attributed to industrial capitalism\nA: Let's\ - \ think step by step. We refer to Wikipedia articles on world history for help.\ - \ The passage talks about the grievance of the work under the industrial capitalism.\ - \ The answer is (D).\n\nQ: This question refers to the following information.\n\ - He contains all works and desires and all perfumes and all tastes. He enfolds the\ - \ whole universe and in silence is loving to all. This is the Spirit that is in\ - \ my heart, this is Brahman. To him I shall come when I go beyond this life, and\ - \ to him will come he who has faith and doubts not.\n\u2014The Upanishads, India,\ - \ c. 1000 BCE\nTo which religion does the speaker most likely belong?\n(A) Hinduism\ - \ (B) Buddhism (C) Shintoism (D) Zoroastrianism\nA: Let's think step by step. We\ - \ refer to Wikipedia articles on world history for help. Brahman refers to the ultimate\ - \ reality of all things in the Hindu religion. In contrast, Buddhism does not have\ - \ a concept of supreme God. The answer is (A)." 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_high_school_world_history + \ geographical and ecological luck.\"\nCopyright © 2015 Cambridge University Press.\n\ + Alfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum jump\" mentioned\ + \ in the passage most directly contributed to which of the following developments\ + \ in the period 1450–1750 C.E.?\n(A) A breakdown in trade routes through the collapse\ + \ of the established state structure (B) An increase in the population of the world\ + \ through more plentiful supplies of food (C) The spread of Chinese and Indian belief\ + \ systems across the world (D) An increase in social unrest\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on world history for help. The \"quantum\ + \ jump\" mentioned in the passage refers to the conquest of the New World and the\ + \ Columbian Exchange. Choice (A) and (C) did not happen in history. Choice (C) refers\ + \ to the human assets. The answer is (B).\n\nQ: This question refers to the following\ + \ information.\n\"The struggle against neo-colonialism is not aimed at excluding\ + \ the capital of the developed world from operating in less developed countries.\ + \ It is aimed at preventing the financial power of the developed countries being\ + \ used in such a way as to impoverish the less developed.\nNon-alignment, as practiced\ + \ by Ghana and many other countries, is based on co-operation with all States whether\ + \ they be capitalist, socialist or have a mixed economy. Such a policy, therefore,\ + \ involves foreign investment from capitalist countries, but it must be invested\ + \ in accordance with a national plan drawn up by the government of the non-aligned\ + \ State with its own interests in mind. The issue is not what return the foreign\ + \ investor receives on his investments…The question is one of power. A State in\ + \ the grip of neo-colonialism is not master of its own destiny.\"\nKwame Nkrumah,\ + \ Neo-Colonialism, 1965\nWhich of the following provides the best context for Nkrumah's\ + \ writings?\n(A) The Industrial Revolution (B) Decolonization (C) Regional Free\ + \ Trade Associations (D) Autarky\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on world history for help. The passage expresses a point that the successful\ + \ fight against neo-colonialism were in danger and the newly independent nations\ + \ like Ghana may be re-colonized via financial power of the developed countries.\ + \ The answer is (B).\n\nQ: This question refers to the following information.\n\"\ + Indeed, as both the fatwas of distinguished [scholars] who base their opinion on\ + \ reason and tradition alike and the consensus of the Sunni community agree that\ + \ the ancient obligation of extirpation, extermination, and expulsion of evil innovation\ + \ must be the aim of our exalted aspiration, for \"Religious zeal is a victory for\ + \ the Faith of God the Beneficent\"; then, in accordance with the words of the Prophet\ + \ (Peace upon him!) 
\"Whosoever introduces evil innovation into our order must be\ + \ expelled\" and \"Whosoever does aught against our order must be expelled,\" action\ + \ has become necessary and exigent…\"\nLetter from Ottoman Sultan Selim I to Safavid\ + \ Shah Ismail I, 1514\nThe letter from Selim I is most clearly an example of which\ + \ of the following?\n(A) The maintenance of military supremacy at all costs (B)\ + \ Expanding tensions between religious sects (C) Factors that brought about the\ + \ collapse of the Ottoman Empire (D) Peacemaking efforts among the Islamic empires\n\ + A: Let's think step by step. We refer to Wikipedia articles on world history for\ + \ help. The passage is an example of expanding tensions between Selim and Ismail.\ + \ In the passage the Selim references the fatwa and the consensus of the Sunni community\ + \ to against whosoever introduces evil. The answer is (B).\n\nQ: This question refers\ + \ to the following information.\n\"The real grievance of the worker is the insecurity\ + \ of his existence; he is not sure that he will always have work, he is not sure\ + \ that he will always be healthy, and he foresees that he will one day be old and\ + \ unfit to work. If he falls into poverty, even if only through a prolonged illness,\ + \ he is then completely helpless, exam_ins to his own devices, and society does\ + \ not currently recognize any real obligation towards him beyond the usual help\ + \ for the poor, even if he has been working all the time ever so faithfully and\ + \ diligently. The usual help for the poor, however, leaves a lot to be desired,\ + \ especially in large cities, where it is very much worse than in the country.\"\ + \nOtto von Bismarck, 1884\nOtto von Bismarck likely made this speech in reaction\ + \ to which of the following issues?\n(A) Social acceptance of child labor (B) Declining\ + \ life expectancy in Germany (C) Criticisms of German trade tariffs (D) Negative\ + \ effects attributed to industrial capitalism\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on world history for help. The passage talks about the grievance\ + \ of the work under the industrial capitalism. The answer is (D).\n\nQ: This question\ + \ refers to the following information.\nHe contains all works and desires and all\ + \ perfumes and all tastes. He enfolds the whole universe and in silence is loving\ + \ to all. This is the Spirit that is in my heart, this is Brahman. To him I shall\ + \ come when I go beyond this life, and to him will come he who has faith and doubts\ + \ not.\n—The Upanishads, India, c. 1000 BCE\nTo which religion does the speaker\ + \ most likely belong?\n(A) Hinduism (B) Buddhism (C) Shintoism (D) Zoroastrianism\n\ + A: Let's think step by step. We refer to Wikipedia articles on world history for\ + \ help. Brahman refers to the ultimate reality of all things in the Hindu religion.\ + \ In contrast, Buddhism does not have a concept of supreme God. The answer is (A)." 
+"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml index 9d6521325e62cd74ca812c6a1f138bd2328f5bf0..3d1f5971d8c9c8cc559020a12111b6c649a4d998 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml @@ -1,48 +1,28 @@ -dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human - aging. - - - Q: All other things being equal, which of the following persons is more likely to - show osteoporosis? - - (A) An older Hispanic American woman (B) An older African American woman (C) An - older Asian American woman (D) An older Native American woman - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. Although osteoporosis can occur at any age, the risk is higher for older people. - It is most common in Asian and non-Hispanic white women. The answer is (C). - - - Q: The finding that adults tend to remember events from their adolescence better - than from other periods in their lives is referred to as the - - (A) Adolescence advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic - retrieval spike - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. Reminiscence bump is a phenomenon that older adults tend to recollect events - during their young ages. People usually have a period of childhood amnesia from - birth to around age 5, and a reminiscence bump between 10 and 30. The answer is - (B). - - - Q: Which element in tobacco smoke is responsible for cancers? - - (A) Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. The benzene, acrylamide and acrylonitrile in tar interact with the lungs and - cause DNA mutations in cells of the lungs, and lead to cancer. The answer is (B). - - - Q: When older adults move to a new state after retirement, which of the following - is the more likely destination? - - (A) Texas (B) California (C) Hawaii (D) Vermont - - A: Let''s think step by step. We refer to Wikipedia articles on human aging for - help. Texas does not have state tax, and has low cost of living compared with the - other three options. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_human_aging +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\nQ: All other things being equal, which of the following persons is more\ + \ likely to show osteoporosis?\n(A) An older Hispanic American woman (B) An older\ + \ African American woman (C) An older Asian American woman (D) An older Native American\ + \ woman\nA: Let's think step by step. We refer to Wikipedia articles on human aging\ + \ for help. Although osteoporosis can occur at any age, the risk is higher for older\ + \ people. It is most common in Asian and non-Hispanic white women. The answer is\ + \ (C).\n\nQ: The finding that adults tend to remember events from their adolescence\ + \ better than from other periods in their lives is referred to as the\n(A) Adolescence\ + \ advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic retrieval\ + \ spike\nA: Let's think step by step. 
We refer to Wikipedia articles on human aging\ + \ for help. Reminiscence bump is a phenomenon that older adults tend to recollect\ + \ events during their young ages. People usually have a period of childhood amnesia\ + \ from birth to around age 5, and a reminiscence bump between 10 and 30. The answer\ + \ is (B).\n\nQ: Which element in tobacco smoke is responsible for cancers?\n(A)\ + \ Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on human aging for help. The benzene,\ + \ acrylamide and acrylonitrile in tar interact with the lungs and cause DNA mutations\ + \ in cells of the lungs, and lead to cancer. The answer is (B).\n\nQ: When older\ + \ adults move to a new state after retirement, which of the following is the more\ + \ likely destination?\n(A) Texas (B) California (C) Hawaii (D) Vermont\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on human aging for help. Texas\ + \ does not have state tax, and has low cost of living compared with the other three\ + \ options. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml index 6b7a12cc830f08457047b0ba45c4a346fae8813f..68a840928914dddf6a3517a192785df5ab9fc61d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml @@ -1,61 +1,34 @@ -dataset_name: human_sexuality -description: 'The following are multiple choice questions (with answers) about human - sexuality. - - - Q: The most common disorder among men who seek sexual therapy is: - - (A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory - disorder - - A: Let''s think step by step. We refer to Wikipedia articles on human sexuality - for help. The most common type of arousal disorder for men is erectile dysfunction, - meaning that a person is not able to get any physical satisfaction from sexual activity - although he may be interested in it. The answer is (C). - - - Q: A woman who knows she has active herpes and untreated syphilis but continues - to have sex without informing her partners of her condition has, in psychoanalytic - terms: - - (A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego - - A: Let''s think step by step. We refer to Wikipedia articles on human sexuality - for help. A person with weak superego tends to be delinquent, criminal or have antisocial - personality. The action of the woman who knows she has active venereal disease but - still have sex with her partners indicate she may has antisocial personality. The - answer is (B). - - - Q: The nature of homosexual activities that occur during preadolescence include - all but which of the following? - - (A) sexual intercourse (B) circle jerks (C) exhibitionism (D) touching each other''s - genitals - - A: Let''s think step by step. We refer to Wikipedia articles on human sexuality - for help. Sexual intercourse is prohibited during preadolescence. The answer is - (A). - - - Q: Women''s ability to have multiple orgasms is primarily due to: - - (A) the fact that they do not have a refractory period. (B) the response of the - inner layers of the vagina. (C) having alternating orgasms in different locations. - (D) the G-Spot. 
- - A: Let''s think step by step. We refer to Wikipedia articles on human sexuality - for help. The refractory period is the time when a person is not able to be erect - or is not interested in sex. The answer is (A). - - - Q: Morning sickness is typically a problem: - - (A) during the first trimester (B) during the second trimester (C) during the third - trimester (D) all through the pregnancy - - A: Let''s think step by step. We refer to Wikipedia articles on human sexuality - for help. Morning sickness usually begins by nine weeks after conception, corresponding - to the first trimester. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_human_sexuality +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\nQ: The most common disorder among men who seek sexual therapy is:\n\ + (A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) ejaculatory\ + \ disorder\nA: Let's think step by step. We refer to Wikipedia articles on human\ + \ sexuality for help. The most common type of arousal disorder for men is erectile\ + \ dysfunction, meaning that a person is not able to get any physical satisfaction\ + \ from sexual activity although he may be interested in it. The answer is (C).\n\ + \nQ: A woman who knows she has active herpes and untreated syphilis but continues\ + \ to have sex without informing her partners of her condition has, in psychoanalytic\ + \ terms:\n(A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego\n\ + A: Let's think step by step. We refer to Wikipedia articles on human sexuality for\ + \ help. A person with weak superego tends to be delinquent, criminal or have antisocial\ + \ personality. The action of the woman who knows she has active venereal disease\ + \ but still have sex with her partners indicate she may has antisocial personality.\ + \ The answer is (B).\n\nQ: The nature of homosexual activities that occur during\ + \ preadolescence include all but which of the following?\n(A) sexual intercourse\ + \ (B) circle jerks (C) exhibitionism (D) touching each other's genitals\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on human sexuality for help.\ + \ Sexual intercourse is prohibited during preadolescence. The answer is (A).\n\n\ + Q: Women's ability to have multiple orgasms is primarily due to:\n(A) the fact that\ + \ they do not have a refractory period. (B) the response of the inner layers of\ + \ the vagina. (C) having alternating orgasms in different locations. (D) the G-Spot.\n\ + A: Let's think step by step. We refer to Wikipedia articles on human sexuality for\ + \ help. The refractory period is the time when a person is not able to be erect\ + \ or is not interested in sex. The answer is (A).\n\nQ: Morning sickness is typically\ + \ a problem:\n(A) during the first trimester (B) during the second trimester (C)\ + \ during the third trimester (D) all through the pregnancy\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on human sexuality for help. Morning sickness\ + \ usually begins by nine weeks after conception, corresponding to the first trimester.\ + \ The answer is (A)." 
+"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml index 655a39e8ae3d85d29841d7940270f04137fff0f8..31d8766740ac34c2d474798f97bd0623cce2f045 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml @@ -1,80 +1,54 @@ -dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international - law. - - - Q: How the consent to be bound of a State may be expressed? - - (A) The consent of a State to be bound is expressed only by ratification (B) The - consent of a state to be bound by a treaty may be expressed by signature, ratification, - acceptance, approval or accession (C) The consent of a State to be bound is expressed - by signature (D) The consent of a State to be bound is expressed by whatever means - they choose - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. Article 11 of Vienna Convention on the Law of Treaties signed in 1969 - states that "the consent of a State to be bound by a treaty may be expressed by - signature, exchange of instruments constituting a treaty, ratification, acceptance, - approval or accession, or by any other means if so agreed." (B) is the most precise - and accurate answer. The answer is (B). - - - Q: What is the judge ad hoc? - - (A) If a party to a contentious case before the ICJ does not have a national sitting - as judge, it is entitled to nominate someone as a judge solely for that case, with - the title of judge ad hoc (B) Judge ad hoc is the member of the bench of the ICJ - with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge is disqualified - or passes away (D) Judge ad hoc is the judge that each party will always nominate - in every contentious case - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. As "ad hoc" implies, a judge ad hoc is appointed only for a specific case - or period, when a party to a contentious case before the International Court of - Justice does not have a regular national sitting as judge. The answer is (A). - - - Q: When ''consent'' can serve as a circumstance precluding the wrongfulness of a - State conduct? - - (A) Consent can serve as a circumstance precluding the wrongfulness whenever it - is given (B) Consent can never serve as a circumstance precluding wrongfulness (C) - Consent can serve as a circumstance precluding wrongfulness, provided the consent - is valid and to the extent that the conduct remains within the limits of the consent - given (D) Consent can always serve as a circumstance precluding wrongfulness, no - matter which organ of the State gives it - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. Valid consent can serve as a circumstance precluding the wrongfulness - of a State conduct if the conduct remains within the limits of that consent, according - to Chapter V of the Responsibility of States for Internationally Wrongful Acts, - 2001, United Nations. The answer is (C). - - - Q: Would a reservation to the definition of torture in the ICCPR be acceptable in - contemporary practice? 
- - (A) This is an acceptable reservation if the reserving country''s legislation employs - a different definition (B) This is an unacceptable reservation because it contravenes - the object and purpose of the ICCPR (C) This is an unacceptable reservation because - the definition of torture in the ICCPR is consistent with customary international - law (D) This is an acceptable reservation because under general international law - States have the right to enter reservations to treaties - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. For it contravenes the object and purpose of the ICCPR, this is an unacceptable - reservation in contemporary practice. The answer is (B). - - - Q: What types of force does Article 2(4) of the UN Charter prohibit? - - (A) Article 2(4) encompasses only armed force (B) Article 2(4) encompasses all types - of force, including sanctions (C) Article 2(4) encompasses all interference in the - domestic affairs of States (D) Article 2(4) encompasses force directed only against - a State''s territorial integrity - - A: Let''s think step by step. We refer to Wikipedia articles on international law - for help. Article 2(4) of the UN Charter prohibits states from using armed forces - in their international relations. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_international_law +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\nQ: How the consent to be bound of a State may be expressed?\n(A) The consent\ + \ of a State to be bound is expressed only by ratification (B) The consent of a\ + \ state to be bound by a treaty may be expressed by signature, ratification, acceptance,\ + \ approval or accession (C) The consent of a State to be bound is expressed by signature\ + \ (D) The consent of a State to be bound is expressed by whatever means they choose\n\ + A: Let's think step by step. We refer to Wikipedia articles on international law\ + \ for help. Article 11 of Vienna Convention on the Law of Treaties signed in 1969\ + \ states that \"the consent of a State to be bound by a treaty may be expressed\ + \ by signature, exchange of instruments constituting a treaty, ratification, acceptance,\ + \ approval or accession, or by any other means if so agreed.\" (B) is the most precise\ + \ and accurate answer. The answer is (B).\n\nQ: What is the judge ad hoc?\n(A) If\ + \ a party to a contentious case before the ICJ does not have a national sitting\ + \ as judge, it is entitled to nominate someone as a judge solely for that case,\ + \ with the title of judge ad hoc (B) Judge ad hoc is the member of the bench of\ + \ the ICJ with a casting vote (C) Judge ad hoc is a surrogate judge, in case a judge\ + \ is disqualified or passes away (D) Judge ad hoc is the judge that each party will\ + \ always nominate in every contentious case\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on international law for help. As \"ad hoc\" implies, a\ + \ judge ad hoc is appointed only for a specific case or period, when a party to\ + \ a contentious case before the International Court of Justice does not have a regular\ + \ national sitting as judge. 
The answer is (A).\n\nQ: When 'consent' can serve as\ + \ a circumstance precluding the wrongfulness of a State conduct?\n(A) Consent can\ + \ serve as a circumstance precluding the wrongfulness whenever it is given (B) Consent\ + \ can never serve as a circumstance precluding wrongfulness (C) Consent can serve\ + \ as a circumstance precluding wrongfulness, provided the consent is valid and to\ + \ the extent that the conduct remains within the limits of the consent given (D)\ + \ Consent can always serve as a circumstance precluding wrongfulness, no matter\ + \ which organ of the State gives it\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on international law for help. Valid consent can serve as a circumstance\ + \ precluding the wrongfulness of a State conduct if the conduct remains within the\ + \ limits of that consent, according to Chapter V of the Responsibility of States\ + \ for Internationally Wrongful Acts, 2001, United Nations. The answer is (C).\n\n\ + Q: Would a reservation to the definition of torture in the ICCPR be acceptable in\ + \ contemporary practice?\n(A) This is an acceptable reservation if the reserving\ + \ country's legislation employs a different definition (B) This is an unacceptable\ + \ reservation because it contravenes the object and purpose of the ICCPR (C) This\ + \ is an unacceptable reservation because the definition of torture in the ICCPR\ + \ is consistent with customary international law (D) This is an acceptable reservation\ + \ because under general international law States have the right to enter reservations\ + \ to treaties\nA: Let's think step by step. We refer to Wikipedia articles on international\ + \ law for help. For it contravenes the object and purpose of the ICCPR, this is\ + \ an unacceptable reservation in contemporary practice. The answer is (B).\n\nQ:\ + \ What types of force does Article 2(4) of the UN Charter prohibit?\n(A) Article\ + \ 2(4) encompasses only armed force (B) Article 2(4) encompasses all types of force,\ + \ including sanctions (C) Article 2(4) encompasses all interference in the domestic\ + \ affairs of States (D) Article 2(4) encompasses force directed only against a State's\ + \ territorial integrity\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on international law for help. Article 2(4) of the UN Charter prohibits states\ + \ from using armed forces in their international relations. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_international_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml index 7e11f0f7bb813e9a731137d54de2e82ed4c578e7..fa354238e46fce2ff4f523bf348ab3cfc0ade580 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml @@ -1,69 +1,45 @@ -dataset_name: jurisprudence -description: 'The following are multiple choice questions (with answers) about jurisprudence. - - - Q: Iverson Jewelers wrote a letter to Miller, ''We have received an exceptionally - fine self winding Rolox watch which we will sell to you at a very favorable price.'' - - (A) The letter is an offer to sell (B) A valid offer cannot be made by letter. (C) - The letter contains a valid offer which will terminate within a reasonable time. - (D) The letter lacks one of the essential elements of an offer. 
- - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. An offer shows the intent to enter into a mutually-beneficial contract with - specific terms. An offer can be made by a letter. While this letter indicates the - willingness to sell, the lack of specific terms, such as transaction price and offer - expiration date, makes it an incomplete offer. The answer is (D). - - - Q: Functions of the law include all but which of the following? - - (A) maximizing individual freedom (B) providing a basis for compromise (C) keeping - the peace (D) promoting the principles of the free enterprise system - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. Laws are fundamentally about helping resolve disputes between individuals, - and therefore essential for maximizing individual freedom, providing a basis for - compromise, and keeping the peace. The answer is (D). - - - Q: The ________ School of jurisprudence postulates that the law is based on what - is "correct." - - (A) Natural Law (B) Analytical (C) Historical (D) Sociological - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. Natural Law School of jurisprudence focuses on the laws of nature, and states - that the law should be based on ethics, morals, and what is "correct". Analytical - deals with the law as it already exists, Historical postulates that the law was - found and not made, and Sociological studies how the law and society impact each - other. The answer is (A). - - - Q: Which word best summarizes Weber''s explanation of the development of formally - rational law? - - (A) Authority. (B) Charisma. (C) Co-operation. (D) Capitalism. - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. Weber explained the development of formal rationality in laws as how the modern - society moved from tradition to rationality, where people decide actions based less - on how they were culturally done and more on expected utilities. How rational individuals - optimize efficiency of accomplishing tasks for higher rewards is a core principle - of Capitalism. The answer is (D). - - - Q: Which position does Rawls claim is the least likely to be adopted by the POP - (people in the original position)? - - (A) The POP would choose equality above liberty. (B) The POP would opt for the ''maximin'' - strategy. (C) The POP would opt for the ''difference principle''. (D) The POP would - reject the ''system of natural liberty.'' - - A: Let''s think step by step. We refer to Wikipedia articles on jurisprudence for - help. The POP would opt for the ''maximin'' strategy, opt for the ''difference principle'', - and reject the ''system of natural liberty'', but the POP would not choose equality - above liberty, since the POP assume both equal and free citizens. The answer is - (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_jurisprudence +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \nQ: Iverson Jewelers wrote a letter to Miller, 'We have received an exceptionally\ + \ fine self winding Rolox watch which we will sell to you at a very favorable price.'\n\ + (A) The letter is an offer to sell (B) A valid offer cannot be made by letter. 
(C)\ + \ The letter contains a valid offer which will terminate within a reasonable time.\ + \ (D) The letter lacks one of the essential elements of an offer.\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on jurisprudence for help. An offer\ + \ shows the intent to enter into a mutually-beneficial contract with specific terms.\ + \ An offer can be made by a letter. While this letter indicates the willingness\ + \ to sell, the lack of specific terms, such as transaction price and offer expiration\ + \ date, makes it an incomplete offer. The answer is (D).\n\nQ: Functions of the\ + \ law include all but which of the following?\n(A) maximizing individual freedom\ + \ (B) providing a basis for compromise (C) keeping the peace (D) promoting the principles\ + \ of the free enterprise system\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on jurisprudence for help. Laws are fundamentally about helping resolve\ + \ disputes between individuals, and therefore essential for maximizing individual\ + \ freedom, providing a basis for compromise, and keeping the peace. The answer is\ + \ (D).\n\nQ: The ________ School of jurisprudence postulates that the law is based\ + \ on what is \"correct.\"\n(A) Natural Law (B) Analytical (C) Historical (D) Sociological\n\ + A: Let's think step by step. We refer to Wikipedia articles on jurisprudence for\ + \ help. Natural Law School of jurisprudence focuses on the laws of nature, and states\ + \ that the law should be based on ethics, morals, and what is \"correct\". Analytical\ + \ deals with the law as it already exists, Historical postulates that the law was\ + \ found and not made, and Sociological studies how the law and society impact each\ + \ other. The answer is (A).\n\nQ: Which word best summarizes Weber's explanation\ + \ of the development of formally rational law?\n(A) Authority. (B) Charisma. (C)\ + \ Co-operation. (D) Capitalism.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on jurisprudence for help. Weber explained the development of formal\ + \ rationality in laws as how the modern society moved from tradition to rationality,\ + \ where people decide actions based less on how they were culturally done and more\ + \ on expected utilities. How rational individuals optimize efficiency of accomplishing\ + \ tasks for higher rewards is a core principle of Capitalism. The answer is (D).\n\ + \nQ: Which position does Rawls claim is the least likely to be adopted by the POP\ + \ (people in the original position)?\n(A) The POP would choose equality above liberty.\ + \ (B) The POP would opt for the 'maximin' strategy. (C) The POP would opt for the\ + \ 'difference principle'. (D) The POP would reject the 'system of natural liberty.'\n\ + A: Let's think step by step. We refer to Wikipedia articles on jurisprudence for\ + \ help. The POP would opt for the 'maximin' strategy, opt for the 'difference principle',\ + \ and reject the 'system of natural liberty', but the POP would not choose equality\ + \ above liberty, since the POP assume both equal and free citizens. The answer is\ + \ (A)." 
+"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml index f6f3c35934ead3c03f9d7a33ae2a801a2bb0d658..c6251e67ee2b250508ad0cc8df3d35f1952c3fe7 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml @@ -1,71 +1,45 @@ -dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical - fallacies. - - - Q: When an arguer causes confusion during refutation because of real or feigned - lack of an ability to engage in refutation, that arguer may have committed the fallacy - of - - (A) poor sportsmanship (B) appeal to compassion (C) argument against the person - (D) ignorance of refutation - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. Ignorance of refutation, one of Aristotle''s original list of logical - fallacies in his Organon, is when someone causes confusion in an argument through - real or feigned inability to engage in refutation, in order to win the argument. - The answer is (D). - - - Q: The complex question fallacy consists of - - (A) arguing something is inferior just because it doesn''t do something it was never - intended to do. (B) including more than one claim in the proposition and treating - proof for one claim as proof for all the claims. (C) drawing a conclusion before - examining the evidence, and only considering evidence that supports that conclusion. - (D) asking a question that includes either an unproven assumption or more than one - question, thus making a straightforward yes or no answer meaningless. - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. The complex question fallacy is when someone makes a single yes or no - answer to a question meaningless, by including either an unproven assumption or - many questions. The latter is also known as the many questions fallacy. The answer - is (D). - - - Q: Arguing that what is true of the parts must be true of the whole is the fallacy - of... - - (A) Division (B) Composition (C) Appeal to the person (D) Appeal to ignorance - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. Fallacy of composition occurs when someone argues what is true of the - parts must be true of the whole. The answer is (B). - - - Q: Which of the following is true of a valid categorical syllogism? - - (A) The minor premise must deny the antecedent (B) The major premise must affirm - the consequent (C) The middle term must be used in at least one premise in a universal - or unqualified sense (D) All of the above - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. A valid categorical syllogism must satisfy several conditions: (1) the - syllogism must have exactly three terms (2) every term of the syllogism must be - used twice exactly, (3) a term may be used only once in any premise, and (4) the - middle term must be used in at least one premise in a universal or unqualified sense, - etc. Only (C) is true. The answer is (C). 
- - - Q: If someone attacks the character of an opposing arguer, instead of responding - to that opponent''s arguments, the first person has probably committed which of - the following fallacies? - - (A) tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi - - A: Let''s think step by step. We refer to Wikipedia articles on logical fallacies - for help. The argument against the person fallacy occurs when someone irrelevantly - attacks the character of an opposing arguer, instead of addressing that opponent''s - arguments. The answer is (C).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_logical_fallacies +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\nQ: When an arguer causes confusion during refutation because of\ + \ real or feigned lack of an ability to engage in refutation, that arguer may have\ + \ committed the fallacy of\n(A) poor sportsmanship (B) appeal to compassion (C)\ + \ argument against the person (D) ignorance of refutation\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on logical fallacies for help. Ignorance\ + \ of refutation, one of Aristotle's original list of logical fallacies in his Organon,\ + \ is when someone causes confusion in an argument through real or feigned inability\ + \ to engage in refutation, in order to win the argument. The answer is (D).\n\n\ + Q: The complex question fallacy consists of\n(A) arguing something is inferior just\ + \ because it doesn't do something it was never intended to do. (B) including more\ + \ than one claim in the proposition and treating proof for one claim as proof for\ + \ all the claims. (C) drawing a conclusion before examining the evidence, and only\ + \ considering evidence that supports that conclusion. (D) asking a question that\ + \ includes either an unproven assumption or more than one question, thus making\ + \ a straightforward yes or no answer meaningless.\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on logical fallacies for help. The complex question\ + \ fallacy is when someone makes a single yes or no answer to a question meaningless,\ + \ by including either an unproven assumption or many questions. The latter is also\ + \ known as the many questions fallacy. The answer is (D).\n\nQ: Arguing that what\ + \ is true of the parts must be true of the whole is the fallacy of...\n(A) Division\ + \ (B) Composition (C) Appeal to the person (D) Appeal to ignorance\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on logical fallacies for help. Fallacy\ + \ of composition occurs when someone argues what is true of the parts must be true\ + \ of the whole. The answer is (B).\n\nQ: Which of the following is true of a valid\ + \ categorical syllogism?\n(A) The minor premise must deny the antecedent (B) The\ + \ major premise must affirm the consequent (C) The middle term must be used in at\ + \ least one premise in a universal or unqualified sense (D) All of the above\nA:\ + \ Let's think step by step. We refer to Wikipedia articles on logical fallacies\ + \ for help. A valid categorical syllogism must satisfy several conditions: (1) the\ + \ syllogism must have exactly three terms (2) every term of the syllogism must be\ + \ used twice exactly, (3) a term may be used only once in any premise, and (4) the\ + \ middle term must be used in at least one premise in a universal or unqualified\ + \ sense, etc. 
Only (C) is true. The answer is (C).\n\nQ: If someone attacks the\ + \ character of an opposing arguer, instead of responding to that opponent's arguments,\ + \ the first person has probably committed which of the following fallacies?\n(A)\ + \ tu quoque (B) horse laugh (C) argument against the person (D) ignoratio elenchi\n\ + A: Let's think step by step. We refer to Wikipedia articles on logical fallacies\ + \ for help. The argument against the person fallacy occurs when someone irrelevantly\ + \ attacks the character of an opposing arguer, instead of addressing that opponent's\ + \ arguments. The answer is (C)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml index 1856af53fbc71c5fb98ed4ff29ad65986fc7ceb3..3a99b9086a04d0d66a9e980b5edcaa3019f98964 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml @@ -1,5 +1,5 @@ -dataset_name: machine_learning -description: "The following are multiple choice questions (with answers) about machine\ +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine\ \ learning.\n\nQ: Which image data augmentation is most common for natural images?\n\ (A) random crop and horizontal flip (B) random crop and vertical flip (C) posterization\ \ (D) dithering\nA: Let's think step by step. Data augmentation is used to increase\ @@ -12,48 +12,49 @@ description: "The following are multiple choice questions (with answers) about m \ learning we consider a binary split according to whether the attribute is above\ \ or below some threshold. Pat suggests that instead we should just have a multiway\ \ split with one branch for each of the distinct values of the attribute. From the\ - \ list below choose the single biggest problem with Pat\u2019s suggestion:\n(A)\ - \ It is too computationally expensive. (B) It would probably result in a decision\ - \ tree that scores badly on the training set and a testset. (C) It would probably\ - \ result in a decision tree that scores well on the training set but badly on a\ - \ testset. (D) It would probably result in a decision tree that scores well on a\ - \ testset but badly on a training set.\nA: Let's think step by step. Because the\ - \ input is real valued, it is unlikely that the same values appear both at training\ - \ and test time. This means that while such a decision tree could yield good performance\ + \ list below choose the single biggest problem with Pat’s suggestion:\n(A) It is\ + \ too computationally expensive. (B) It would probably result in a decision tree\ + \ that scores badly on the training set and a testset. (C) It would probably result\ + \ in a decision tree that scores well on the training set but badly on a testset.\ + \ (D) It would probably result in a decision tree that scores well on a testset\ + \ but badly on a training set.\nA: Let's think step by step. Because the input is\ + \ real valued, it is unlikely that the same values appear both at training and test\ + \ time. 
This means that while such a decision tree could yield good performance\ \ on the training data, when evaluated on the test data it will perform badly because\ - \ the decision tree won\u2019t know what to do with numbers that did not appear\ - \ in the training data. The answer is (C).\n\nQ: You are reviewing papers for the\ - \ World\u2019s Fanciest Machine Learning Conference, and you see submissions with\ - \ the following claims. Which ones would you consider accepting?\n(A) My method\ - \ achieves a training error lower than all previous methods! (B) My method achieves\ - \ a test error lower than all previous methods! (Footnote: When regularisation parameter\ - \ \u03BB is chosen so as to minimise test error.) (C) My method achieves a test\ + \ the decision tree won’t know what to do with numbers that did not appear in the\ + \ training data. The answer is (C).\n\nQ: You are reviewing papers for the World’s\ + \ Fanciest Machine Learning Conference, and you see submissions with the following\ + \ claims. Which ones would you consider accepting?\n(A) My method achieves a training\ + \ error lower than all previous methods! (B) My method achieves a test error lower\ + \ than all previous methods! (Footnote: When regularisation parameter λ is chosen\ + \ so as to minimise test error.) (C) My method achieves a test error lower than\ + \ all previous methods! (Footnote: When regularisation parameter λ is chosen so\ + \ as to minimise cross-validaton error.) (D) My method achieves a cross-validation\ \ error lower than all previous methods! (Footnote: When regularisation parameter\ - \ \u03BB is chosen so as to minimise cross-validaton error.) (D) My method achieves\ - \ a cross-validation error lower than all previous methods! (Footnote: When regularisation\ - \ parameter \u03BB is chosen so as to minimise cross-validaton error.)\nA: Let's\ - \ think step by step. In machine learning, we train with some data and fixed hyperparameters\ - \ and the training error can be arbitrarily low, so (A) can\u2019t be right. Then,\ - \ one compares different hyperparameters by selecting the model with the lowest\ - \ cross-validation error, this means that (B) and (D) are not the right procedure.\ - \ The only relevant number after these is the test error and thus (C) is the right\ - \ answer. The answer is (C).\n\nQ: A 6-sided die is rolled 15 times and the results\ - \ are: side 1 comes up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times;\ - \ side 5: 4 times; side 6: 5 times. Based on these results, what is the probability\ - \ of side 3 coming up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16\ - \ (D) 1.0/5\nA: Let's think step by step. Add-1 smoothing adds the value of one\ - \ to the different counts and then normalizes the probabilities accordingly. The\ - \ counts after adding one will be: side 1 comes up 1 time; side 2: 2 times; side\ - \ 3: 3 times; side 4: 4 times; side 5: 5 times; side 6: 6 times. The number of sum\ - \ one die rolls will be 21, so the probability of drawing a three is 3/21 = 1/7.\ - \ The answer is (B).\n\nQ: To achieve an 0/1 loss estimate that is less than 1 percent\ - \ of the true 0/1 loss (with probability 95%), according to Hoeffding's inequality\ - \ the IID test set must have how many examples?\n(A) around 10 examples (B) around\ - \ 100 examples (C) between 100 and 500 examples (D) more than 1000 examples\nA:\ - \ Let's think step by step. 
By the Hoeffding\u2019s inequality, we expect that with\ - \ 95% probability the in-sample and out-of-sample errors differ by epsilon when\ - \ we have N samples if 2 exp(-2 epsilon^2 N)<0.05, this implies that N > -1/(2*epsilon**2)\ - \ log ( 0.05/2 )= log (40)*5000. Since log(40)>1, we have that one needs more than\ - \ 1000 examples. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_machine_learning + \ λ is chosen so as to minimise cross-validaton error.)\nA: Let's think step by\ + \ step. In machine learning, we train with some data and fixed hyperparameters and\ + \ the training error can be arbitrarily low, so (A) can’t be right. Then, one compares\ + \ different hyperparameters by selecting the model with the lowest cross-validation\ + \ error, this means that (B) and (D) are not the right procedure. The only relevant\ + \ number after these is the test error and thus (C) is the right answer. The answer\ + \ is (C).\n\nQ: A 6-sided die is rolled 15 times and the results are: side 1 comes\ + \ up 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times; side 5: 4 times;\ + \ side 6: 5 times. Based on these results, what is the probability of side 3 coming\ + \ up when using Add-1 Smoothing?\n(A) 2.0/15 (B) 1.0/7 (C) 3.0/16 (D) 1.0/5\nA:\ + \ Let's think step by step. Add-1 smoothing adds the value of one to the different\ + \ counts and then normalizes the probabilities accordingly. The counts after adding\ + \ one will be: side 1 comes up 1 time; side 2: 2 times; side 3: 3 times; side 4:\ + \ 4 times; side 5: 5 times; side 6: 6 times. The number of sum one die rolls will\ + \ be 21, so the probability of drawing a three is 3/21 = 1/7. The answer is (B).\n\ + \nQ: To achieve an 0/1 loss estimate that is less than 1 percent of the true 0/1\ + \ loss (with probability 95%), according to Hoeffding's inequality the IID test\ + \ set must have how many examples?\n(A) around 10 examples (B) around 100 examples\ + \ (C) between 100 and 500 examples (D) more than 1000 examples\nA: Let's think step\ + \ by step. By the Hoeffding’s inequality, we expect that with 95% probability the\ + \ in-sample and out-of-sample errors differ by epsilon when we have N samples if\ + \ 2 exp(-2 epsilon^2 N)<0.05, this implies that N > -1/(2*epsilon**2) log ( 0.05/2\ + \ )= log (40)*5000. Since log(40)>1, we have that one needs more than 1000 examples.\ + \ The answer is (D)." +"group": "mmlu_flan_cot_fewshot_stem" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml index db2f9642c91381480700438db6436127f53c9bee..1259e076824496a0cc876673684ec131093909bb 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml @@ -1,54 +1,33 @@ -dataset_name: management -description: 'The following are multiple choice questions (with answers) about management. - - - Q: How can organisational structures that are characterised by democratic and inclusive - styles of management be described? - - (A) Hierarchical (B) Bureaucratic (C) Flat (D) Functional - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - Flat organizational structures are characterized by democratic and inclusive styles - of management, and have few (if any) levels of management between the workers and - managers. 
The answer is (C). - - - Q: Hygiene factors are associated with which writer? - - (A) Frederick Hertzberg (B) D.C. McClelland (C) Abraham Maslow (D) Douglas McGregor - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - Hygiene factors include compensation, company policies, supervision, interpersonal - relations, and work environments. Hertzberg lists them as factors that cannot motivate - employees but can minimize job dissatisfaction. The answer is (A). - - - Q: What characteristic is not a key feature of the ''open systems'' model of management? - - (A) Morale (B) Innovation (C) Growth resource (D) Adaptation - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - The key characteristics of an open system in management include innovation, growth - resource, and adaption, but do not include morale. The answer is (A). - - - Q: Which element of the cultural web forms regalia? - - (A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - The cultural web is a tool for mapping an organization''s culture, where symbols - form the regalia that visually expresses the values that the organization holds - as important. The answer is (A). - - - Q: What are the two main dimensions of the Ohio Studies into leadership? - - (A) Starting position and end position (B) Initial environment and changed environment - (C) Organisational structure and conditioning (D) Initiating structure and considerations - - A: Let''s think step by step. We refer to Wikipedia articles on management for help. - The Ohio State Leadership Studies conducted in the 1940s identified initiating structure - and consideration as the two main dimensions of leader behavior. The answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_management +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \nQ: How can organisational structures that are characterised by democratic and\ + \ inclusive styles of management be described?\n(A) Hierarchical (B) Bureaucratic\ + \ (C) Flat (D) Functional\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on management for help. Flat organizational structures are characterized by democratic\ + \ and inclusive styles of management, and have few (if any) levels of management\ + \ between the workers and managers. The answer is (C).\n\nQ: Hygiene factors are\ + \ associated with which writer?\n(A) Frederick Hertzberg (B) D.C. McClelland (C)\ + \ Abraham Maslow (D) Douglas McGregor\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on management for help. Hygiene factors include compensation,\ + \ company policies, supervision, interpersonal relations, and work environments.\ + \ Hertzberg lists them as factors that cannot motivate employees but can minimize\ + \ job dissatisfaction. The answer is (A).\n\nQ: What characteristic is not a key\ + \ feature of the 'open systems' model of management?\n(A) Morale (B) Innovation\ + \ (C) Growth resource (D) Adaptation\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on management for help. The key characteristics of an open system in\ + \ management include innovation, growth resource, and adaption, but do not include\ + \ morale. 
The answer is (A).\n\nQ: Which element of the cultural web forms regalia?\n\ + (A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems\n\ + A: Let's think step by step. We refer to Wikipedia articles on management for help.\ + \ The cultural web is a tool for mapping an organization's culture, where symbols\ + \ form the regalia that visually expresses the values that the organization holds\ + \ as important. The answer is (A).\n\nQ: What are the two main dimensions of the\ + \ Ohio Studies into leadership?\n(A) Starting position and end position (B) Initial\ + \ environment and changed environment (C) Organisational structure and conditioning\ + \ (D) Initiating structure and considerations\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on management for help. The Ohio State Leadership Studies\ + \ conducted in the 1940s identified initiating structure and consideration as the\ + \ two main dimensions of leader behavior. The answer is (D)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_management" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml index 5dd683daa319cb61a8838ff8e617340195cfe401..d8a6b9b8fe985666f07d417cc9dc30b11b9f293d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml @@ -1,66 +1,40 @@ -dataset_name: marketing -description: 'The following are multiple choice questions (with answers) about marketing. - - - Q: Although the content and quality can be as controlled as direct mail, response - rates of this medium are lower because of the lack of a personal address mechanism. - This media format is known as: - - (A) Care lines. (B) Direct mail. (C) Inserts. (D) Door to door. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - Door to door marketing delivers non-addressed items within all buildings within - a geographic area. While it can control the content and quality as well as direct - mail marketing, its response rate is lower because of the lack of a personal address - mechanism. The answer is (D). - - - Q: In an organization, the group of people tasked with buying decisions is referred - to as the _______________. - - (A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making - unit. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - In an organization, the group of the people tasked with buying decision is referred - to as the decision-making unit. The answer is (D). - - - Q: The single group within society that is most vulnerable to reference group influence - is: - - (A) The older consumer who feels somewhat left out of things. (B) The married women, - many of whom feel a need for stability in their lives. (C) New immigrants who really - want to assimilate into their new culture. (D) Children, who base most of their - buying decisions on outside influences. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - Children, who mostly based their buying decisions on outside influences, are the - single group within society that is more vulnerable to reference group influence. - The answer is (D). - - - Q: Which of the following is an assumption in Maslow''s hierarchy of needs? - - (A) Needs are dependent on culture and also on social class. 
(B) Lower-level needs - must be at least partially satisfied before higher needs can affect behaviour. (C) - Needs are not prioritized or arranged in any particular order. (D) Satisfied needs - are motivators, and new needs emerge when current needs remain unmet. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - Maslow''s hierarchy of needs, from the bottom upwards, are physiological (food and - clothing), safety, love and belonging needs, esteem, and self-actualization. Lower-level - needs must be at least partially satisfied before higher ones can affect behavior. - The answer is (B). - - - Q: _____________ is a natural outcome when combining demographic and geographic - variables. - - (A) Geodemographics (B) Product differentiation. (C) ANSOFF matrix. (D) Brand management. - - A: Let''s think step by step. We refer to Wikipedia articles on marketing for help. - Geodemographics is a natural outcome when combining demographic and geographic variables. - The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_marketing +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \nQ: Although the content and quality can be as controlled as direct mail, response\ + \ rates of this medium are lower because of the lack of a personal address mechanism.\ + \ This media format is known as:\n(A) Care lines. (B) Direct mail. (C) Inserts.\ + \ (D) Door to door.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on marketing for help. Door to door marketing delivers non-addressed items within\ + \ all buildings within a geographic area. While it can control the content and quality\ + \ as well as direct mail marketing, its response rate is lower because of the lack\ + \ of a personal address mechanism. The answer is (D).\n\nQ: In an organization,\ + \ the group of people tasked with buying decisions is referred to as the _______________.\n\ + (A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D) Decision-making\ + \ unit.\nA: Let's think step by step. We refer to Wikipedia articles on marketing\ + \ for help. In an organization, the group of the people tasked with buying decision\ + \ is referred to as the decision-making unit. The answer is (D).\n\nQ: The single\ + \ group within society that is most vulnerable to reference group influence is:\n\ + (A) The older consumer who feels somewhat left out of things. (B) The married women,\ + \ many of whom feel a need for stability in their lives. (C) New immigrants who\ + \ really want to assimilate into their new culture. (D) Children, who base most\ + \ of their buying decisions on outside influences.\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on marketing for help. Children, who mostly based\ + \ their buying decisions on outside influences, are the single group within society\ + \ that is more vulnerable to reference group influence. The answer is (D).\n\nQ:\ + \ Which of the following is an assumption in Maslow's hierarchy of needs?\n(A) Needs\ + \ are dependent on culture and also on social class. (B) Lower-level needs must\ + \ be at least partially satisfied before higher needs can affect behaviour. (C)\ + \ Needs are not prioritized or arranged in any particular order. (D) Satisfied needs\ + \ are motivators, and new needs emerge when current needs remain unmet.\nA: Let's\ + \ think step by step. 
We refer to Wikipedia articles on marketing for help. Maslow's\ + \ hierarchy of needs, from the bottom upwards, are physiological (food and clothing),\ + \ safety, love and belonging needs, esteem, and self-actualization. Lower-level\ + \ needs must be at least partially satisfied before higher ones can affect behavior.\ + \ The answer is (B).\n\nQ: _____________ is a natural outcome when combining demographic\ + \ and geographic variables.\n(A) Geodemographics (B) Product differentiation. (C)\ + \ ANSOFF matrix. (D) Brand management.\nA: Let's think step by step. We refer to\ + \ Wikipedia articles on marketing for help. Geodemographics is a natural outcome\ + \ when combining demographic and geographic variables. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_marketing" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml index ebf699aab3f85bd60df88e1032e076e5cdc11ffb..bf77059221f91d588b7d2e436b0188f7c3faefad 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml @@ -1,61 +1,37 @@ -dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical - genetics. - - - Q: The stage of meiosis in which chromosomes pair and cross over is: - - (A) prophase I (B) metaphase I (C) prophase II (D) metaphase II - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. Prophase I is the stage of meiosis where homologous chromosomes pair with - each other and exchange genetic material. The answer is (A). - - - Q: DNA ligase is - - (A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial - origin which cuts DNA at defined base sequences (C) an enzyme that facilitates transcription - of specific genes (D) an enzyme which limits the level to which a particular nutrient - reaches - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. DNA ligase is a type of enzyme (EC 6.5.1.1) responsible for joining DNA - strands together by catalyzing a phosphodiester bond. The answer is (A). - - - Q: Which of the following conditions does not show multifactorial inheritance? - - (A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural tube defects) (D) - Marfan syndrome - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. Multifactorial inheritance is when more than a single factor is responsible - for causing a given trait or health problem. Genes cannot be the only factor. Marfan - syndrome, on the other hand, requires only one abnormal copy of the of the Marfan - gene, from one parent, to inherit the trait. The answer is (D). - - - Q: A gene showing codominance - - (A) has both alleles independently expressed in the heterozygote (B) has one allele - dominant to the other (C) has alleles tightly linked on the same chromosome (D) - has alleles expressed at the same time in development - - A: Let''s think step by step. We refer to Wikipedia articles on medical genetics - for help. Codominance, as it relates to genetics, refers to a type of genetic inheritance - where the phenotype of both the parents is easily observed in the offspring. A heterozygote - is an individual having two different alleles of a gene. The answer is (A). 
-
-
- Q: Large triplet repeat expansions can be detected by:
-
- (A) polymerase chain reaction. (B) single strand conformational polymorphism analysis.
- (C) Southern blotting. (D) Western blotting.
-
- A: Let''s think step by step. We refer to Wikipedia articles on medical genetics
- for help. A Southern blot is a method in molecular biology for detecting specific
- DNA sequences in a sample. Large triplet repeat expansions are usually detected
- with this method. The answer is (C).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_medical_genetics
+"dataset_name": "medical_genetics"
+"description": "The following are multiple choice questions (with answers) about medical\
+ \ genetics.\n\nQ: The stage of meiosis in which chromosomes pair and cross over\
+ \ is:\n(A) prophase I (B) metaphase I (C) prophase II (D) metaphase II\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on medical genetics for help.\
+ \ Prophase I is the stage of meiosis where homologous chromosomes pair with each\
+ \ other and exchange genetic material. The answer is (A).\n\nQ: DNA ligase is\n\
+ (A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of bacterial\
+ \ origin which cuts DNA at defined base sequences (C) an enzyme that facilitates\
+ \ transcription of specific genes (D) an enzyme which limits the level to which\
+ \ a particular nutrient reaches\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on medical genetics for help. DNA ligase is a type of enzyme (EC 6.5.1.1)\
+ \ responsible for joining DNA strands together by catalyzing a phosphodiester bond.\
+ \ The answer is (A).\n\nQ: Which of the following conditions does not show multifactorial\
+ \ inheritance?\n(A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural\
+ \ tube defects) (D) Marfan syndrome\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on medical genetics for help. Multifactorial inheritance is when more\
+ \ than a single factor is responsible for causing a given trait or health problem.\
+ \ Genes cannot be the only factor. Marfan syndrome, on the other hand, requires\
+ \ only one abnormal copy of the Marfan gene, from one parent, to inherit\
+ \ the trait. The answer is (D).\n\nQ: A gene showing codominance\n(A) has both alleles\
+ \ independently expressed in the heterozygote (B) has one allele dominant to the\
+ \ other (C) has alleles tightly linked on the same chromosome (D) has alleles expressed\
+ \ at the same time in development\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on medical genetics for help. Codominance, as it relates to genetics,\
+ \ refers to a type of genetic inheritance where the phenotype of both the parents\
+ \ is easily observed in the offspring. A heterozygote is an individual having two\
+ \ different alleles of a gene. The answer is (A).\n\nQ: Large triplet repeat expansions\
+ \ can be detected by:\n(A) polymerase chain reaction. (B) single strand conformational\
+ \ polymorphism analysis. (C) Southern blotting. (D) Western blotting.\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on medical genetics for help.\
+ \ A Southern blot is a method in molecular biology for detecting specific DNA sequences\
+ \ in a sample. Large triplet repeat expansions are usually detected with this method.\
+ \ The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml index a506e9403badec9987c4d56227a659e8927e2dd7..0075bd64f5c99327e9024542eb2add49adf7cbc1 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml @@ -1,54 +1,27 @@ -dataset_name: miscellaneous -description: 'The following are multiple choice questions (with answers) about miscellaneous. - - - Q: Which of these songs was a Top 10 hit for the rock band The Police? - - (A) ''Radio Ga-Ga'' (B) ''Ob-la-di Ob-la-da'' (C) ''De Do Do Do De Da Da Da'' (D) - ''In-a-Gadda-Da-Vida'' - - A: Let''s think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by - Queen. Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly. - Leaving ''De Do Do Do De Da Da Da'' as the only song by The Police, and also a Top - 10 hit. The answer is (C). - - - Q: What place is named in the title of the 1979 live album by rock legends Cheap - Trick? - - (A) Budapest (B) Budokan (C) Bhutan (D) Britain - - A: Let''s think step by step. We refer to Wikipedia for help. Nippon Budokan is - an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including - Cheap Trick in 1978. ''Cheap Trick at Budokan'' became the name of their album. - The answer is (B). - - - Q: What is produced during photosynthesis? - - (A) hydrogen (B) nylon (C) oxygen (D) light - - A: Let''s think step by step. We refer to Wikipedia for help. Photosynthesis is - the process in which green plants use the green pigment chlorophyll to synthesize - foods with water and carbon dioxide. Oxygen is the byproduct of this process. The - answer is (C). - - - Q: Who is the shortest man to ever win an NBA slam dunk competition? - - (A) Anthony ''Spud'' Webb (B) Michael ''Air'' Jordan (C) Tyrone ''Muggsy'' Bogues - (D) Julius ''Dr J'' Erving - - A: Let''s think step by step. We refer to Wikipedia for help. In 1986, Spud Webb, - standing only 5''7" became the shortest NBA player in history to win an official - slam dunk contest. The answer is (A). - - - Q: How many axles does a standard automobile have? - - (A) one (B) two (C) four (D) eight - - A: Let''s think step by step. We refer to Wikipedia for help. Most cars have two - axles to rotate the wheels.. The answer is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_miscellaneous +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \nQ: Which of these songs was a Top 10 hit for the rock band The Police?\n(A) 'Radio\ + \ Ga-Ga' (B) 'Ob-la-di Ob-la-da' (C) 'De Do Do Do De Da Da Da' (D) 'In-a-Gadda-Da-Vida'\n\ + A: Let's think step by step. We refer to Wikipedia for help. Radio Ga-Ga is by Queen.\ + \ Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is by Iron Butterfly.\ + \ Leaving 'De Do Do Do De Da Da Da' as the only song by The Police, and also a Top\ + \ 10 hit. The answer is (C).\n\nQ: What place is named in the title of the 1979\ + \ live album by rock legends Cheap Trick?\n(A) Budapest (B) Budokan (C) Bhutan (D)\ + \ Britain\nA: Let's think step by step. We refer to Wikipedia for help. 
Nippon Budokan\
+ \ is an indoor arena in Tokyo, Japan renowned for hosting rock music concerts including\
+ \ Cheap Trick in 1978. 'Cheap Trick at Budokan' became the name of their album.\
+ \ The answer is (B).\n\nQ: What is produced during photosynthesis?\n(A) hydrogen\
+ \ (B) nylon (C) oxygen (D) light\nA: Let's think step by step. We refer to Wikipedia\
+ \ for help. Photosynthesis is the process in which green plants use the green pigment\
+ \ chlorophyll to synthesize foods with water and carbon dioxide. Oxygen is the byproduct\
+ \ of this process. The answer is (C).\n\nQ: Who is the shortest man to ever win\
+ \ an NBA slam dunk competition?\n(A) Anthony 'Spud' Webb (B) Michael 'Air' Jordan\
+ \ (C) Tyrone 'Muggsy' Bogues (D) Julius 'Dr J' Erving\nA: Let's think step by step.\
+ \ We refer to Wikipedia for help. In 1986, Spud Webb, standing only 5'7\", became\
+ \ the shortest NBA player in history to win an official slam dunk contest. The answer\
+ \ is (A).\n\nQ: How many axles does a standard automobile have?\n(A) one (B) two\
+ \ (C) four (D) eight\nA: Let's think step by step. We refer to Wikipedia for help.\
+ \ Most cars have two axles to rotate the wheels. The answer is (B)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_miscellaneous"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
index fdde36334d57e6273b276e70ba1ff49217287338..e2c301928645ee416c01fd22c658f178850fd2b8 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml
@@ -1,72 +1,49 @@
-dataset_name: moral_disputes
-description: 'The following are multiple choice questions (with answers) about moral
- disputes.
-
-
- Q: Baron admits that the versions of the ticking bomb hypothetical she discusses
- are "stunningly stupid," but she claims this is actually evidence of
-
- (A) the stupidity of most traditional philosophical examples. (B) a general lack
- of intelligence among people with advanced degrees. (C) the wrongness of torture.
- (D) the readiness on the part of many intelligent people to see torture as the best
- solution to deal with terrorism.
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
- help. The ticking bomb hypothetical poses a problem where many people will die to
- an exploding bomb, if the hypothetical terrorist does not disclose how to defuse
- it. Baron sees this hypothetical as silly, but its prevalence does suggest intelligent
- people, particularly utilitarians, see torture as justifiable to save the lives
- in this scenario. The answer is (D).
-
-
- Q: A fertilized ovum is also known as
-
- (A) a zygote. (B) an embryo. (C) a viability. (D) a blastocyst.
-
- A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for
- help. Once a single sperm penetrates the layers of an egg to form a new cell, that
- cell is called a zygote. The answer is (A).
-
-
- Q: Pence compares six different cases of reproduction, from natural twinning to
- SCNT. What conclusion does he draw from this comparison?
-
- (A) SCNT is not a different kind of reproduction because there are no morally relevant
- differences between it and other permissible means of reproduction. (B) Because
- there is a low risk of harm for natural twinning, there will be a low risk of harm
- for SCNT. 
(C) both A and B (D) neither A nor B - - A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for - help. SCNT, known as ''somatic cell nuclear transfer'', is a technique in which - the nucleus of a somatic (body) cell is transferred to the cytoplasm of an enucleated - egg. This is used in farm animal cloning. Pence may compare this to other processes - of reproduction, and see little moral separation. The answer is (A). - - - Q: Which of the following is an intrinsic good, according to Singer? - - (A) being in the upper class (B) the ability to learn and solve problems (C) being - more beautiful than most of the population (D) being shorter than average - - A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for - help. Bioethicist Peter Singer sees intrinsic value as innate values conferred by - oneself, for oneself. Innanimate objects can be beautiful, short, or have some valuable - criteria, but capabilities are intrinsically good. The answer is (B). - - - Q: According to Metz, what is wrong with consequentialist arguments against capital - punishment based on African values? - - (A) It is unclear as of yet whether or not capital punishment deters harm to the - community. (B) It is unclear as of yet whether or not capital punishment deters - harm to any individuals. (C) Consequentialism is not supported by African values. - (D) Even though consequentialism is supported by African values, no consequentialist - arguments framed in terms of African values have been offered. - - A: Let''s think step by step. We refer to Wikipedia articles on moral disputes for - help. Thaddeus Metz is a humanities research professor focusing on ethical philosophy - in South Africa. Metz has written the death penalty is unjustified as it treats - individuals as incapable of communal relations. It is unclear that capital punishment - is to the benefit of, or a deterrent of harm to the community. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_moral_disputes +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\nQ: Baron admits that the versions of the ticking bomb hypothetical\ + \ she discusses are \"stunningly stupid,\" but she claims this is actually evidence\ + \ of\n(A) the stupidity of most traditional philosophical examples. (B) a general\ + \ lack of intelligence among people with advanced degrees. (C) the wrongness of\ + \ torture. (D) the readiness on the part of many intelligent people to see torture\ + \ as the best solution to deal with terrorism.\nA: Let's think step by step. We\ + \ refer to Wikipedia articles on moral disputes for help. The ticking bomb hypothetical\ + \ poses a problem where many people will die to an exploding bomb, if the hypothetical\ + \ terrorist does not disclose how to defuse it. Baron sees this hypothetical as\ + \ silly, but its prevalence does suggest intelligent people, particularly utilitarians,\ + \ see torture as justifiable to save the lives in this scenario. The answer is (D).\n\ + \nQ: A fertilized ovum is also known as\n(A) a zygote. (B) an embryo. (C) a viability.\ + \ (D) a blastocyst.\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on moral disputes for help. Once a single sperm penetrates the layers of an egg\ + \ to form a new cell, that cell is called a zygote. 
The answer is (A).\n\nQ: Pence\
+ \ compares six different cases of reproduction, from natural twinning to SCNT. What\
+ \ conclusion does he draw from this comparison?\n(A) SCNT is not a different kind\
+ \ of reproduction because there are no morally relevant differences between it and\
+ \ other permissible means of reproduction. (B) Because there is a low risk of harm\
+ \ for natural twinning, there will be a low risk of harm for SCNT. (C) both A and\
+ \ B (D) neither A nor B\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on moral disputes for help. SCNT, known as 'somatic cell nuclear transfer', is\
+ \ a technique in which the nucleus of a somatic (body) cell is transferred to the\
+ \ cytoplasm of an enucleated egg. This is used in farm animal cloning. Pence may\
+ \ compare this to other processes of reproduction, and see little moral separation.\
+ \ The answer is (A).\n\nQ: Which of the following is an intrinsic good, according\
+ \ to Singer?\n(A) being in the upper class (B) the ability to learn and solve problems\
+ \ (C) being more beautiful than most of the population (D) being shorter than average\n\
+ A: Let's think step by step. We refer to Wikipedia articles on moral disputes for\
+ \ help. Bioethicist Peter Singer sees intrinsic value as innate values conferred\
+ \ by oneself, for oneself. Inanimate objects can be beautiful, short, or have some\
+ \ valuable criteria, but capabilities are intrinsically good. The answer is (B).\n\
+ \nQ: According to Metz, what is wrong with consequentialist arguments against capital\
+ \ punishment based on African values?\n(A) It is unclear as of yet whether or not\
+ \ capital punishment deters harm to the community. (B) It is unclear as of yet whether\
+ \ or not capital punishment deters harm to any individuals. (C) Consequentialism\
+ \ is not supported by African values. (D) Even though consequentialism is supported\
+ \ by African values, no consequentialist arguments framed in terms of African values\
+ \ have been offered.\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on moral disputes for help. Thaddeus Metz is a humanities research professor focusing\
+ \ on ethical philosophy in South Africa. Metz has written that the death penalty is unjustified\
+ \ as it treats individuals as incapable of communal relations. It is unclear that\
+ \ capital punishment is to the benefit of, or a deterrent of harm to the community.\
+ \ The answer is (A)."
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_moral_disputes"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
index 4c82f5d5b892266fbe226d7c620f78e142f9144a..cb2599e21d2494296c31fcf1bcfab14be2f5378e 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml
@@ -1,66 +1,44 @@
-dataset_name: moral_scenarios
-description: 'The following are multiple choice questions (with answers) about moral
- scenarios.
-
-
- Q: For which of these two scenarios does the main character (who uses I/me/my) do
- something clearly morally wrong, according to ordinary moral standards in the US
- as of 2020? Scenario 1 | I took his wallet after he harassed me at the bar. Scenario
- 2 | I made love to my newborn baby boy. 
- - (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong - - A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios - for help. Stealing from someone is not considered an ethical response to harrassment. - And having sex with a minor is egregiously unethical. The answer is (A). - - - Q: For which of these two scenarios does the main character (who uses I/me/my) do - something clearly morally wrong, according to ordinary moral standards in the US - as of 2020? Scenario 1 | I asked for a cigarette from the guy outside the store. - Scenario 2 | When the security guard left his post, I pocketed a rare coin. - - (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong - - A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios - for help. Asking for something is not morally wrong. Stealing is morally wrong. - The answer is (C). - - - Q: For which of these two scenarios does the main character (who uses I/me/my) do - something clearly morally wrong, according to ordinary moral standards in the US - as of 2020? Scenario 1 | During the house''s demolition, I pushed him into a wall. - Scenario 2 | When I choose a babysitter for my twin nieces, I don''t care about - her racial background. - - (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong - - A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios - for help. Physical violence is morally wrong. Being open to hiring people from different - racial backgrounds is an expected moral stance. The answer is (B). - - - Q: For which of these two scenarios does the main character (who uses I/me/my) do - something clearly morally wrong, according to ordinary moral standards in the US - as of 2020? Scenario 1 | I bought an old pick-up truck for work. Scenario 2 | I - looked up at the gorgeous waterfall and threw a rock at it. - - (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong - - A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios - for help. Neither of these are wrong as no one is wronged by buing a pick-up truck - or throwing a rock into a stream. The answer is (D). - - - Q: For which of these two scenarios does the main character (who uses I/me/my) do - something clearly morally wrong, according to ordinary moral standards in the US - as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The new - girl is embarrassed to live in a trailer park, so I invite the class to her house. - - (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong - - A: Let''s think step by step. We refer to Wikipedia articles on moral scenarios - for help. Loving someone is not wrong. However, exposing something that someone - is embarrassed about could be considered quite mean. The answer is (C).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_moral_scenarios +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\nQ: For which of these two scenarios does the main character (who\ + \ uses I/me/my) do something clearly morally wrong, according to ordinary moral\ + \ standards in the US as of 2020? Scenario 1 | I took his wallet after he harassed\ + \ me at the bar. 
Scenario 2 | I made love to my newborn baby boy.\n(A) Wrong, Wrong\
+ \ (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think\
+ \ step by step. We refer to Wikipedia articles on moral scenarios for help. Stealing\
+ \ from someone is not considered an ethical response to harassment. And having\
+ \ sex with a minor is egregiously unethical. The answer is (A).\n\nQ: For which\
+ \ of these two scenarios does the main character (who uses I/me/my) do something\
+ \ clearly morally wrong, according to ordinary moral standards in the US as of 2020?\
+ \ Scenario 1 | I asked for a cigarette from the guy outside the store. Scenario\
+ \ 2 | When the security guard left his post, I pocketed a rare coin.\n(A) Wrong,\
+ \ Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not wrong\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on moral scenarios for help.\
+ \ Asking for something is not morally wrong. Stealing is morally wrong. The answer\
+ \ is (C).\n\nQ: For which of these two scenarios does the main character (who uses\
+ \ I/me/my) do something clearly morally wrong, according to ordinary moral standards\
+ \ in the US as of 2020? Scenario 1 | During the house's demolition, I pushed him\
+ \ into a wall. Scenario 2 | When I choose a babysitter for my twin nieces, I don't\
+ \ care about her racial background.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not\
+ \ wrong, Wrong (D) Not wrong, Not wrong\nA: Let's think step by step. We refer to\
+ \ Wikipedia articles on moral scenarios for help. Physical violence is morally wrong.\
+ \ Being open to hiring people from different racial backgrounds is an expected moral\
+ \ stance. The answer is (B).\n\nQ: For which of these two scenarios does the main\
+ \ character (who uses I/me/my) do something clearly morally wrong, according to\
+ \ ordinary moral standards in the US as of 2020? Scenario 1 | I bought an old pick-up\
+ \ truck for work. Scenario 2 | I looked up at the gorgeous waterfall and threw a\
+ \ rock at it.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not\
+ \ wrong, Not wrong\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on moral scenarios for help. Neither of these are wrong as no one is wronged by\
+ \ buying a pick-up truck or throwing a rock into a stream. The answer is (D).\n\n\
+ Q: For which of these two scenarios does the main character (who uses I/me/my) do\
+ \ something clearly morally wrong, according to ordinary moral standards in the\
+ \ US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario 2 | The\
+ \ new girl is embarrassed to live in a trailer park, so I invite the class to her\
+ \ house.\n(A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong,\
+ \ Not wrong\nA: Let's think step by step. We refer to Wikipedia articles on moral\
+ \ scenarios for help. Loving someone is not wrong. However, exposing something that\
+ \ someone is embarrassed about could be considered quite mean. The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml index eae79250926b848481b87ee82bf004ffa89128e8..8af8d29a7f8c414d44a5a57883f162a4b3d09fc5 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml @@ -1,72 +1,48 @@ -dataset_name: nutrition -description: 'The following are multiple choice questions (with answers) about nutrition. - - - Q: What is the first-line drug for patients with type 2 diabetes and obesity, as - of 2020? - - (A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - Metformin (Fortamet, Glumetza, or others) is usually the first medication prescribed - for type 2 diabetes, as well as obesity. It works by lowering glucose production - in the liver and improving the body''s sensitivity to insulin. The answer is (B). - - - Q: Which of the following statements is correct (according to knowledge in 2020)? - - (A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame - (B) Consumers with phenylketonuria must avoid the consumption of the sweetener saccharin - (C) Consumers with phenylketonuria must avoid the consumption of the sweetener sucralose - (D) Consumers with phenylketonuria must avoid the consumption of the sweetener acesulfame - K - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - People with phenylketonuria (PKU) cannot break down the amino acid phenylalanine. - As it builds up in the blood and brain it can lead to brain damage. People with - PKU should avoid foods that are converted to phenylalanine in the body, such as - aspartame. The answer is (A). - - - Q: Which of the following statements about iodine is correct, as of 2020? - - (A) 50% of adults consume iodine at levels below the RNI (B) Dairy products are - a poor source of iodine (C) The iodine content of organic milk is generally lower - that the level in non-organic milk (D) UK dietary reference values recommend an - increase in iodine intake in pregnancy - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - Organic milk usually has less iodine content than non-organic milk. The answer is - (C). - - - Q: Which of the following is the most plausible explanation for the protective effect - of dietary fibre against cancer of the colon, as of 2020? - - (A) Propionic acid, formed during colonic fibre fermentation inhibits liver fatty - acid synthesis (B) Butyric acid, formed during colonic fibre fermentation stimulates - "silencing" of the SLC5A8 tumour suppressor gene (C) None of these options are correct - (D) Butyric acid, formed during colonic fibre fermentation stimulates anti-oxidant - defences in the colon - - A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help. - Dietary fibre is inversely proportional to the risk of colorectal cancer. This is - presumed because butyric acid (BA) stimulates antioxidants which help protect the - colon from cancerous tumors. The answer is (D). - - - Q: In a cohort study, the risk ratio of developing diabetes was 0.86 when comparing - consumers of tea (the exposed) to those who did not drink tea (the unexposed). 
Which
- one statement is correct (according to knowledge in 2020)?
-
- (A) The tea drinkers have lower risk of developing diabetes. (B) The tea drinkers
- have higher risk of developing diabetes. (C) Based on the information given we cannot
- tell if the observed difference in disease risk is the result of chance. (D) The
- risk ratio is close to the value one, so there is no difference in disease risk
- between the two groups.
-
- A: Let''s think step by step. We refer to Wikipedia articles on nutrition for help.
- The risk ratio is not sufficiently reduced that it could not be explained by random
- chance given the studies sample size. The answer is (C).'
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_nutrition
+"dataset_name": "nutrition"
+"description": "The following are multiple choice questions (with answers) about nutrition.\n\
+ \nQ: What is the first-line drug for patients with type 2 diabetes and obesity,\
+ \ as of 2020?\n(A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin\nA: Let's\
+ \ think step by step. We refer to Wikipedia articles on nutrition for help. Metformin\
+ \ (Fortamet, Glumetza, or others) is usually the first medication prescribed for\
+ \ type 2 diabetes, as well as obesity. It works by lowering glucose production in\
+ \ the liver and improving the body's sensitivity to insulin. The answer is (B).\n\
+ \nQ: Which of the following statements is correct (according to knowledge in 2020)?\n\
+ (A) Consumers with phenylketonuria must avoid the consumption of the sweetener aspartame\
+ \ (B) Consumers with phenylketonuria must avoid the consumption of the sweetener\
+ \ saccharin (C) Consumers with phenylketonuria must avoid the consumption of the\
+ \ sweetener sucralose (D) Consumers with phenylketonuria must avoid the consumption\
+ \ of the sweetener acesulfame K\nA: Let's think step by step. We refer to Wikipedia\
+ \ articles on nutrition for help. People with phenylketonuria (PKU) cannot break\
+ \ down the amino acid phenylalanine. As it builds up in the blood and brain it can\
+ \ lead to brain damage. People with PKU should avoid foods that are converted to\
+ \ phenylalanine in the body, such as aspartame. The answer is (A).\n\nQ: Which of\
+ \ the following statements about iodine is correct, as of 2020?\n(A) 50% of adults\
+ \ consume iodine at levels below the RNI (B) Dairy products are a poor source of\
+ \ iodine (C) The iodine content of organic milk is generally lower than the level\
+ \ in non-organic milk (D) UK dietary reference values recommend an increase in iodine\
+ \ intake in pregnancy\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on nutrition for help. Organic milk usually has less iodine content than non-organic\
+ \ milk. The answer is (C).\n\nQ: Which of the following is the most plausible explanation\
+ \ for the protective effect of dietary fibre against cancer of the colon, as of\
+ \ 2020?\n(A) Propionic acid, formed during colonic fibre fermentation inhibits liver\
+ \ fatty acid synthesis (B) Butyric acid, formed during colonic fibre fermentation\
+ \ stimulates \"silencing\" of the SLC5A8 tumour suppressor gene (C) None of these\
+ \ options are correct (D) Butyric acid, formed during colonic fibre fermentation\
+ \ stimulates anti-oxidant defences in the colon\nA: Let's think step by step. We\
+ \ refer to Wikipedia articles on nutrition for help. Dietary fibre is inversely\
+ \ proportional to the risk of colorectal cancer. 
This is presumed because butyric\
+ \ acid (BA) stimulates antioxidants which help protect the colon from cancerous\
+ \ tumors. The answer is (D).\n\nQ: In a cohort study, the risk ratio of developing\
+ \ diabetes was 0.86 when comparing consumers of tea (the exposed) to those who did\
+ \ not drink tea (the unexposed). Which one statement is correct (according to knowledge\
+ \ in 2020)?\n(A) The tea drinkers have lower risk of developing diabetes. (B) The\
+ \ tea drinkers have higher risk of developing diabetes. (C) Based on the information\
+ \ given we cannot tell if the observed difference in disease risk is the result\
+ \ of chance. (D) The risk ratio is close to the value one, so there is no difference\
+ \ in disease risk between the two groups.\nA: Let's think step by step. We refer\
+ \ to Wikipedia articles on nutrition for help. The risk ratio is not sufficiently\
+ \ reduced that it could not be explained by random chance given the study's sample\
+ \ size. The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_other"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_nutrition"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml
index 60ce6c547f25340938c95ece08b9721486529b26..5f52bc0c8378947d44086c691dfe18a9209e655f 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml
@@ -1,30 +1,30 @@
-dataset_name: philosophy
-description: "The following are multiple choice questions (with answers) about philosophy.\n\
+"dataset_name": "philosophy"
+"description": "The following are multiple choice questions (with answers) about philosophy.\n\
 \nQ: The study of reality in the broadest sense, an inquiry into the elemental nature\
 \ of the universe and the things in it, is known as _____.\n(A) metaphysics (B)\
 \ epistemology (C) quantum physics (D) axiology\nA: Let's think step by step. We\
 \ refer to Wikipedia articles on philosophy for help. Among the options, only metaphysics\
 \ studies the nature of reality and existence. The answer is (A).\n\nQ: According\
- \ to Moore\u2019s \u201Cideal utilitarianism,\u201D the right action is the one\
- \ that brings about the greatest amount of:\n(A) pleasure. (B) happiness. (C) good.\
- \ (D) virtue.\nA: Let's think step by step. We refer to Wikipedia articles on philosophy\
- \ for help. Moore's \"ideal utilitarianism\" states that one's actions should maximize\
- \ intrinsic goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion,\
- \ what was his perspective on the meaning of life?\n(A) optimist (B) satisfied (C)\
- \ nominally religious (D) pessimist\nA: Let's think step by step. We refer to Wikipedia\
- \ articles on philosophy for help. Before his conversion, Tolstoy feels that life\
- \ was uncertain, which is a pessimist's point of view. The answer is (D).\n\nQ:\
- \ According to d'Holbach, people always act according to _____.\n(A) free choices\
- \ (B) dictates of the soul (C) necessary natural laws (D) undetermined will\nA:\
- \ Let's think step by step. We refer to Wikipedia articles on philosophy for help.\
- \ d'Holbach believes that people act according to necessary laws, and it proves\
- \ nothing about people's free will. The answer is (C).\n\nQ: Psychological egoism\
- \ is:\n(A) an ethical theory about how we ought to behave. (B) a generalization\
- \ concerning the way people tend to behave. 
(C) a claim about human nature and the\
- \ ways people are capable of behaving. (D) none of the above.\nA: Let's think step\
- \ by step. We refer to Wikipedia articles on philosophy for help. Psychological\
- \ egoism suggests that one behaves based on what makes one feels good, hence it\
- \ is a claim about human nature and how humans are capable of behaving. The answer\
- \ is (C)."
-include: _mmlu_flan_cot_fewshot_template_yaml
-task: mmlu_flan_cot_fewshot_philosophy
+ \ to Moore’s “ideal utilitarianism,” the right action is the one that brings about\
+ \ the greatest amount of:\n(A) pleasure. (B) happiness. (C) good. (D) virtue.\n\
+ A: Let's think step by step. We refer to Wikipedia articles on philosophy for help.\
+ \ Moore's \"ideal utilitarianism\" states that one's actions should maximize intrinsic\
+ \ goods. The answer is (C).\n\nQ: Before Tolstoy's Christian conversion, what was\
+ \ his perspective on the meaning of life?\n(A) optimist (B) satisfied (C) nominally\
+ \ religious (D) pessimist\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on philosophy for help. Before his conversion, Tolstoy felt that life was uncertain,\
+ \ which is a pessimist's point of view. The answer is (D).\n\nQ: According to d'Holbach,\
+ \ people always act according to _____.\n(A) free choices (B) dictates of the soul\
+ \ (C) necessary natural laws (D) undetermined will\nA: Let's think step by step.\
+ \ We refer to Wikipedia articles on philosophy for help. d'Holbach believes that\
+ \ people act according to necessary laws, and it proves nothing about people's free\
+ \ will. The answer is (C).\n\nQ: Psychological egoism is:\n(A) an ethical theory\
+ \ about how we ought to behave. (B) a generalization concerning the way people tend\
+ \ to behave. (C) a claim about human nature and the ways people are capable of behaving.\
+ \ (D) none of the above.\nA: Let's think step by step. We refer to Wikipedia articles\
+ \ on philosophy for help. Psychological egoism suggests that one behaves based on\
+ \ what makes one feel good, hence it is a claim about human nature and how humans\
+ \ are capable of behaving. The answer is (C)."
+"group": "mmlu_flan_cot_fewshot_humanities"
+"include": "_mmlu_flan_cot_fewshot_template_yaml"
+"task": "mmlu_flan_cot_fewshot_philosophy"
diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml
index e1c8dcc6d9bd5a1b0d0f4d81c99fd9510bd4a196..dc350126017399442383736d1ecddd07412a4295 100644
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml
@@ -1,67 +1,42 @@
-dataset_name: prehistory
-description: 'The following are multiple choice questions (with answers) about prehistory.
-
-
- Q: What is the approximate mean cranial capacity of Homo erectus?
-
- (A) under 650 cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc
-
- A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help.
- The average cranium capacity of Homo erectus is less than 1000 cubic cm. The answer
- is (C).
-
-
- Q: According to Timothy Pauketat, the evidence for social stratification and political
- power at Cahokia suggests:
-
- (A) a center of Mississippian civilization with conditions similar to the rise of
- early states. (B) the limitations of authority in a Native American society of egalitarian
- foragers. (C) a simple chiefdom or perhaps a complex chiefdom had evolved by A.D.
- 1500. 
(D) a center of Mississippian civilization with conditions similar to societies - on the Northwest Coast of North America. - - A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. - Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian - culture, where he found similar conditions to the rise of early states. The answer - is (A). - - - Q: Recent research on hominid species dating from the Middle Pliocene indicates - there was (as of 2020): - - (A) a great amount of species diversity, or a single species that exhibited a lot - of diversity. (B) very little species diversity during this period and very few - hominids. (C) decreased species diversity due to a prolonged ice age followed by - a severe drought. (D) decreased species diversity but increased numbers of hammerstones - and flakes, indicating stone tool manufacture. - - A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. - Recent research has recognized multiple hominid species from the Middle Pliocene, - meaning that there is a great amount of species diversity or diversity in a single - species. The answer is (A). - - - Q: Researchers now believe that the decline of the Maya was caused chiefly by: - - (A) a cataclysm of some kind, such as an earthquake, volcano, or tsunami. (B) ecological - degradation resulting from slash-and-burn farming techniques. (C) endless wars between - neighboring Mayan city-states. (D) practices of interbreeding that led to a steep - rise in congenital disorders. - - A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. - Researchers believe that the Maya collapse was mainly caused by over-exploitation - of natural resources like the slash-and-burn farming techniques. The answer is (B). - - - Q: The great Mayan king Pacal built temples in the city of Palenque in order to: - - (A) satisfy the powerful Mayan astronomer priests. (B) display his generosity to - the common people, since they were allowed to live in the temples. (C) frighten - away enemies, in particular the Spaniards. (D) legitimize his kingship, since his - father was not royal. - - A: Let''s think step by step. We refer to Wikipedia articles on prehistory for help. - Pacal built the temples as the funerary monument to legitimize his kingship. The - answer is (D).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_prehistory +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \nQ: What is the approximate mean cranial capacity of Homo erectus?\n(A) under 650\ + \ cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on prehistory for help. The average cranium\ + \ capacity of Homo erectus is less than 1000 cubic cm. The answer is (C).\n\nQ:\ + \ According to Timothy Pauketat, the evidence for social stratification and political\ + \ power at Cahokia suggests:\n(A) a center of Mississippian civilization with conditions\ + \ similar to the rise of early states. (B) the limitations of authority in a Native\ + \ American society of egalitarian foragers. (C) a simple chiefdom or perhaps a complex\ + \ chiefdom had evolved by A.D. 1500. (D) a center of Mississippian civilization\ + \ with conditions similar to societies on the Northwest Coast of North America.\n\ + A: Let's think step by step. 
We refer to Wikipedia articles on prehistory for help.\ + \ Timothy Pauketat is known for his research on Cahokia, the center of the Mississippian\ + \ culture, where he found similar conditions to the rise of early states. The answer\ + \ is (A).\n\nQ: Recent research on hominid species dating from the Middle Pliocene\ + \ indicates there was (as of 2020):\n(A) a great amount of species diversity, or\ + \ a single species that exhibited a lot of diversity. (B) very little species diversity\ + \ during this period and very few hominids. (C) decreased species diversity due\ + \ to a prolonged ice age followed by a severe drought. (D) decreased species diversity\ + \ but increased numbers of hammerstones and flakes, indicating stone tool manufacture.\n\ + A: Let's think step by step. We refer to Wikipedia articles on prehistory for help.\ + \ Recent research has recognized multiple hominid species from the Middle Pliocene,\ + \ meaning that there is a great amount of species diversity or diversity in a single\ + \ species. The answer is (A).\n\nQ: Researchers now believe that the decline of\ + \ the Maya was caused chiefly by:\n(A) a cataclysm of some kind, such as an earthquake,\ + \ volcano, or tsunami. (B) ecological degradation resulting from slash-and-burn\ + \ farming techniques. (C) endless wars between neighboring Mayan city-states. (D)\ + \ practices of interbreeding that led to a steep rise in congenital disorders.\n\ + A: Let's think step by step. We refer to Wikipedia articles on prehistory for help.\ + \ Researchers believe that the Maya collapse was mainly caused by over-exploitation\ + \ of natural resources like the slash-and-burn farming techniques. The answer is\ + \ (B).\n\nQ: The great Mayan king Pacal built temples in the city of Palenque in\ + \ order to:\n(A) satisfy the powerful Mayan astronomer priests. (B) display his\ + \ generosity to the common people, since they were allowed to live in the temples.\ + \ (C) frighten away enemies, in particular the Spaniards. (D) legitimize his kingship,\ + \ since his father was not royal.\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on prehistory for help. Pacal built the temples as the funerary monument\ + \ to legitimize his kingship. The answer is (D)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml index c4957a1f821c9632e74b6cf751f2737c43995d00..57538d214d3d1e43addfb21aa686c8d51c0e7322 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml @@ -1,7 +1,7 @@ -dataset_name: professional_accounting -description: "The following are multiple choice questions (with answers) about professional\ - \ accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer\u2019\ - s subledger. Which of the following management assertions is supported by this test?\n\ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\nQ: An auditor traces the serial numbers on equipment to a nonissuer’s\ + \ subledger. 
Which of the following management assertions is supported by this test?\n\ (A) Valuation and allocation (B) Completeness (C) Rights and obligations (D) Presentation\ \ and disclosure\nA: Let's think step by step. We refer to Wikipedia articles on\ \ accounting for help. The completeness assertion is tested by tracing supporting\ @@ -43,5 +43,6 @@ description: "The following are multiple choice questions (with answers) about p \ transactions, only Proceeds from long-term debt belongs to the financing activities\ \ section of cashflow, hence the amount reported should be $100000. The answer is\ \ (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_accounting +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml index f3a957db2a1fe1bb0ad2025fc142b6d0a617c04c..82d7a960775375b88c51f88358d19eebf5f5392c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml @@ -1,5 +1,5 @@ -dataset_name: professional_law -description: "The following are multiple choice questions (with answers) about professional\ +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional\ \ law.\n\nQ: A son owed a creditor $5,000. The son's father contacted the creditor\ \ and told him that he wanted to pay the son's debt. The father signed a document\ \ that stated the father would pay the son's debt at a rate of $500 a month for\ @@ -101,5 +101,6 @@ description: "The following are multiple choice questions (with answers) about p \ Amendment further supports the First Amendment by establishing a due process clause.\ \ Hence the strongest argument should be the statute is overbroad and consequently\ \ invalid under the First and Fourteenth Amendments. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_law +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml index 89b890f80c07065204daa8dc44630c01efd2f49a..2263ae455376abe9c84c7c3d139a5ed8f9476c48 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml @@ -1,18 +1,18 @@ -dataset_name: professional_medicine -description: "The following are multiple choice questions (with answers) about professional\ +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional\ \ medicine.\n\nQ: A 22-year-old male marathon runner presents to the office with\ \ the complaint of right-sided rib pain when he runs long distances. Physical examination\ - \ reveals normal heart and lung findings and an exhalation dysfunction at ribs\_\ - 4-5 on the right. Which of the following muscles or muscle groups will be most useful\ + \ reveals normal heart and lung findings and an exhalation dysfunction at ribs 4-5\ + \ on the right. 
Which of the following muscles or muscle groups will be most useful\ \ in correcting this dysfunction utilizing a direct method?\n(A) anterior scalene\ \ (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum\nA: Let's think\ \ step by step. We refer to Wikipedia articles on medicine for help. Among the options,\ \ only pectoralis minor muscle origins from the outer surfaces of the 3rd to 5th\ \ ribs. The answer is (C).\n\nQ: A 36-year-old male presents to the office with\ - \ a\_3-week\_history of low back pain. He denies any recent trauma but says that\ - \ he climbs in and out of his truck numerous times a day for his job. Examination\ - \ of the patient in the prone position reveals a deep sacral sulcus on the left,\ - \ a posterior inferior lateral angle on the right, and a lumbosacral junction that\ + \ a 3-week history of low back pain. He denies any recent trauma but says that he\ + \ climbs in and out of his truck numerous times a day for his job. Examination of\ + \ the patient in the prone position reveals a deep sacral sulcus on the left, a\ + \ posterior inferior lateral angle on the right, and a lumbosacral junction that\ \ springs freely on compression. The most likely diagnosis is\n(A) left-on-left\ \ sacral torsion (B) left-on-right sacral torsion (C) right unilateral sacral flexion\ \ (D) right-on-right sacral torsion\nA: Let's think step by step. We refer to Wikipedia\ @@ -23,9 +23,9 @@ description: "The following are multiple choice questions (with answers) about p \ nonproductive cough, runny nose, and frontal headache. He says the headache is\ \ worse in the morning and ibuprofen does provide some relief. He has not had shortness\ \ of breath. Medical history is unremarkable. He takes no medications other than\ - \ the ibuprofen for pain. Vital signs are temperature 37.4\xB0C (99.4\xB0F), pulse\ - \ 88/min, respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the\ - \ nares shows erythematous mucous membranes. Examination of the throat shows erythema\ + \ the ibuprofen for pain. Vital signs are temperature 37.4°C (99.4°F), pulse 88/min,\ + \ respirations 18/min, and blood pressure 120/84 mm Hg. Examination of the nares\ + \ shows erythematous mucous membranes. Examination of the throat shows erythema\ \ and follicular lymphoid hyperplasia on the posterior oropharynx. There is no palpable\ \ cervical adenopathy. Lungs are clear to auscultation. Which of the following is\ \ the most likely cause of this patient's symptoms?\n(A) Allergic rhinitis (B) Epstein-Barr\ @@ -57,13 +57,14 @@ description: "The following are multiple choice questions (with answers) about p \ A follow-up visit in the office 2 weeks ago disclosed elevated urinary normetanephrine\ \ and metanephrine and plasma aldosterone concentrations. The patient was referred\ \ to a surgeon, who recommended the adrenalectomy. Today, vital signs are temperature\ - \ 36.6\xB0C (97.9\xB0F), pulse 100/min, respirations 14/min, and blood pressure\ - \ 170/95 mm Hg. Physical examination discloses no significant findings. Initial\ - \ preoperative preparation should include treatment with which of the following?\n\ - (A) Labetalol (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\n\ + \ 36.6°C (97.9°F), pulse 100/min, respirations 14/min, and blood pressure 170/95\ + \ mm Hg. Physical examination discloses no significant findings. 
Initial preoperative\ + \ preparation should include treatment with which of the following?\n(A) Labetalol\ + \ (B) A loading dose of potassium chloride (C) Nifedipine (D) Phenoxybenzamine\n\ A: Let's think step by step. We refer to Wikipedia articles on medicine for help.\ \ The symptoms and the adrenal mass suggested pheochromocytoma, and the blood pressure\ \ indicates hypertension. Phenoxybenzamine is used to treat hypertension caused\ \ by pheochromocytoma. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_medicine +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml index e1e5206d69ec4db9bdf8bacaa0f37df3242a0756..42a9a42e80b216be5c9b905f8dd5f45d8c1ae15c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml @@ -1,5 +1,5 @@ -dataset_name: professional_psychology -description: "The following are multiple choice questions (with answers) about professional\ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ \ psychology.\n\nQ: In the construction of a multiple regression equation for purposes\ \ of prediction, the optimal combination of measures is one in which the predictors\n\ (A) are uncorrelated with each other but are moderately correlated with the criterion\ @@ -18,30 +18,31 @@ description: "The following are multiple choice questions (with answers) about p \ step by step. We refer to Wikipedia articles on psychology for help. The definition\ \ of mode is the most frequently occurring number. The answer is (D).\n\nQ: Carl\ \ Jung believed that a client's transference:\n(A) is a fantasy that distracts the\ - \ client from reality. (B) represents \u201Cmixed feelings\u201D toward the therapist.\ - \ (C) \"is a form of \"\"acting out.\"\"\" (D) reflects the client\u2019s personal\ - \ and collective unconscious.\nA: Let's think step by step. We refer to Wikipedia\ - \ articles on psychology for help. Transference is a phenomenon that a person's\ - \ feelings are unconsciously redirected, so it reflects the client's personal and\ - \ collective unconscious. The answer is (D).\n\nQ: In terms of Hofstede\u2019s (1980)\ - \ five cultural dimensions, the United States scores at the top of the scale on:\n\ - (A) individualism. (B) individualism and power distance. (C) power distance and\ - \ masculinity. (D) uncertainty avoidance.\nA: Let's think step by step. We refer\ - \ to Wikipedia articles on psychology for help. US scores highest on individualism\ - \ among the five cultural dimensions. The answer is (A).\n\nQ: One of your therapy\ - \ clients asks your advice about a good weight- reduction program. You have investigated\ - \ the programs in the community and are enrolled in the one you consider the best.\ - \ This program offers a $50 bonus to its patrons for each new person they bring\ - \ into the program. 
Under these circumstances, your most appropriate response would\ - \ be to\n(A) tell your client the pros and cons of each program you know about except\ - \ for the one in which you are enrolled (B) recommend to your client the program\ - \ in which you are enrolled and explain the $50 bonus you will receive (C) recommend\ - \ to your client the program in which you are enrolled and offer to have the $50\ - \ bonus credited to your client's account in the program (D) tell your client the\ - \ pros and cons of each program you know about, but do not claim the $50 bonus if\ - \ your client enrolls in your program\nA: Let's think step by step. We refer to\ - \ Wikipedia articles on psychology for help. Based on the circumstances, you should\ - \ tell your client about the pros and cons of each program, but it would be inappropriate\ - \ to receive the bonus, so you should not claim the $50 bonus. The answer is (D)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_professional_psychology + \ client from reality. (B) represents “mixed feelings” toward the therapist. (C)\ + \ \"is a form of \"\"acting out.\"\"\" (D) reflects the client’s personal and collective\ + \ unconscious.\nA: Let's think step by step. We refer to Wikipedia articles on psychology\ + \ for help. Transference is a phenomenon that a person's feelings are unconsciously\ + \ redirected, so it reflects the client's personal and collective unconscious. The\ + \ answer is (D).\n\nQ: In terms of Hofstede’s (1980) five cultural dimensions, the\ + \ United States scores at the top of the scale on:\n(A) individualism. (B) individualism\ + \ and power distance. (C) power distance and masculinity. (D) uncertainty avoidance.\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ US scores highest on individualism among the five cultural dimensions. The answer\ + \ is (A).\n\nQ: One of your therapy clients asks your advice about a good weight-\ + \ reduction program. You have investigated the programs in the community and are\ + \ enrolled in the one you consider the best. This program offers a $50 bonus to\ + \ its patrons for each new person they bring into the program. Under these circumstances,\ + \ your most appropriate response would be to\n(A) tell your client the pros and\ + \ cons of each program you know about except for the one in which you are enrolled\ + \ (B) recommend to your client the program in which you are enrolled and explain\ + \ the $50 bonus you will receive (C) recommend to your client the program in which\ + \ you are enrolled and offer to have the $50 bonus credited to your client's account\ + \ in the program (D) tell your client the pros and cons of each program you know\ + \ about, but do not claim the $50 bonus if your client enrolls in your program\n\ + A: Let's think step by step. We refer to Wikipedia articles on psychology for help.\ + \ Based on the circumstances, you should tell your client about the pros and cons\ + \ of each program, but it would be inappropriate to receive the bonus, so you should\ + \ not claim the $50 bonus. The answer is (D)." 
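+# Editorial annotation (not part of the committed YAML): the "group" key below
+# is new in this PR; it is assumed to register the subtask under an MMLU
+# category aggregate, so that requesting the group name on the command line
+# runs and averages every subtask tagged with it.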
+"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml index be4edf98392c771061d9144850fb60ba399a7161..87e32dccb789bf84146754bd906f9c8bd26569f0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml @@ -1,65 +1,39 @@ -dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public - relations. - - - Q: Earth Hour was a campaign launched by which organization? - - (A) Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund - - A: Let''s think step by step. We refer to Wikipedia articles on public relations - for help. Earth Hour is a worldwide movement oragnized launched by the World Wildlife - Fund. The answer is (D). - - - Q: In issues management, what is the most proactive approach to addressing negative - or misleading information posted online about your organization? - - (A) Buy domain names that could be used by opposition groups. (B) Post anonymous - comments on blogs to combat this information. (C) Prepare a news release that discredits - the inaccurate information. (D) Make policy changes to address complaints highlighted - on these sites. - - A: Let''s think step by step. We refer to Wikipedia articles on public relations - for help. In issues management, the most proactive approach to addressing negative - or misleading information posted online is to make policy changes to address complaints - highlighted on those sites. The answer is (D). - - - Q: At which stage in the planning process would a situation analysis be carried - out? - - (A) Defining the program (B) Planning the program (C) Taking action and implementing - ideas (D) Evaluation of the program - - A: Let''s think step by step. We refer to Wikipedia articles on public relations - for help. Situation analyses are typically carried out during the planning process - stage of defining the program. The answer is (A). - - - Q: Which of these statements is true of the Vatican in 2010 at the time of the accusations - of child abuse cover-ups? - - (A) There was a coordinated media response. (B) Consistent messages were communicated. - (C) Criticisms were taken as attacks on the Catholic Church. (D) The credibility - of the Vatican was upheld. - - A: Let''s think step by step. We refer to Wikipedia articles on public relations - for help. In 2010 when there were accusations of child abuse cover-ups, the Vatican - took those criticisms as attacks on the Catholic Church. The answer is (C). - - - Q: What should a public relations media practitioner do if she does not know the - answer to a reporter''s question? - - (A) Give the reporter other information she is certain is correct. (B) Say that - the information is ''off the record'' and will be disseminated later. (C) Say ''I - don''t know'' and promise to provide the information later. (D) Say ''no comment,'' - rather than appear uninformed. - - A: Let''s think step by step. We refer to Wikipedia articles on public relations - for help. If a public relations media practitioner does not know the answer to a - reporter''s question, they should say ''I don''t know'' and offer to provide the - information later. The answer is (C).' 
-include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_public_relations +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\nQ: Earth Hour was a campaign launched by which organization?\n(A)\ + \ Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on public relations for help. Earth Hour\ + \ is a worldwide movement oragnized launched by the World Wildlife Fund. The answer\ + \ is (D).\n\nQ: In issues management, what is the most proactive approach to addressing\ + \ negative or misleading information posted online about your organization?\n(A)\ + \ Buy domain names that could be used by opposition groups. (B) Post anonymous comments\ + \ on blogs to combat this information. (C) Prepare a news release that discredits\ + \ the inaccurate information. (D) Make policy changes to address complaints highlighted\ + \ on these sites.\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ public relations for help. In issues management, the most proactive approach to\ + \ addressing negative or misleading information posted online is to make policy\ + \ changes to address complaints highlighted on those sites. The answer is (D).\n\ + \nQ: At which stage in the planning process would a situation analysis be carried\ + \ out?\n(A) Defining the program (B) Planning the program (C) Taking action and\ + \ implementing ideas (D) Evaluation of the program\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on public relations for help. Situation analyses\ + \ are typically carried out during the planning process stage of defining the program.\ + \ The answer is (A).\n\nQ: Which of these statements is true of the Vatican in 2010\ + \ at the time of the accusations of child abuse cover-ups?\n(A) There was a coordinated\ + \ media response. (B) Consistent messages were communicated. (C) Criticisms were\ + \ taken as attacks on the Catholic Church. (D) The credibility of the Vatican was\ + \ upheld.\nA: Let's think step by step. We refer to Wikipedia articles on public\ + \ relations for help. In 2010 when there were accusations of child abuse cover-ups,\ + \ the Vatican took those criticisms as attacks on the Catholic Church. The answer\ + \ is (C).\n\nQ: What should a public relations media practitioner do if she does\ + \ not know the answer to a reporter's question?\n(A) Give the reporter other information\ + \ she is certain is correct. (B) Say that the information is 'off the record' and\ + \ will be disseminated later. (C) Say 'I don't know' and promise to provide the\ + \ information later. (D) Say 'no comment,' rather than appear uninformed.\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on public relations for help.\ + \ If a public relations media practitioner does not know the answer to a reporter's\ + \ question, they should say 'I don't know' and offer to provide the information\ + \ later. The answer is (C)." 
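+# Editorial annotation (not part of the committed YAML): "include" merges the
+# shared configuration from _mmlu_flan_cot_fewshot_template_yaml into this
+# file; keys declared locally (dataset_name, description, group, task) are
+# assumed to take precedence over the template's values.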
+"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml index b08c321a4528c52bcc2c1af644b88259f1c6621a..afc3199dbb3b1a5fb2692d77cf0c14005ff84de6 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml @@ -1,5 +1,5 @@ -dataset_name: security_studies -description: "The following are multiple choice questions (with answers) about security\ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security\ \ studies.\n\nQ: What are the frameworks of analysis within which terrorism has\ \ been considered (as of 2020)?\n(A) Competition between larger nations has resulted\ \ in some countries actively supporting terrorist groups to undermine the strength\ @@ -81,5 +81,6 @@ description: "The following are multiple choice questions (with answers) about s \ for negotiation or concession.\nA: Let's think step by step. We refer to Wikipedia\ \ articles on security studies for help. Coercive diplomacy uses the threat of force\ \ to induce the opponent to comply with demands. The answer is (B)." -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_security_studies +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml index 38974b00b713722c3c03d29cf001a5673ce08578..27de15a50f283f9fb332327f7f4810450884e5e5 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml @@ -1,67 +1,43 @@ -dataset_name: sociology -description: 'The following are multiple choice questions (with answers) about sociology. - - - Q: Which of the following is not a problem associated with official statistics on - strike action? - - (A) most strikes go unnoticed by employers and the mass media (B) not all industrial - disputes will be reported by the employer (C) the definition of strikes excludes - those that involve fewer than ten workers or last less than one day (D) it is hard - to compare strikes that were measured in different ways - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - Official statistics on strike action can be problematic because not all industrial - disputes will be reported by employers, the definition of strikes excludes those - that involves fewer than ten workers or last less than one day, and it is hard to - compare strikes that were measured in different ways. Thus, (A) is not a problem - associated with official statistics on strike action. The answer is (A). - - - Q: What does Berger (1963) describe as a metaphor for social reality? - - (A) a fairground ride (B) a circus (C) a puppet theatre (D) a ballet - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - Berger describes social reality using the metaphor of a puppet theatre. The answer - is (C). 
- - - Q: The term ''hegemony'' refers to: - - (A) the tendency for the working class not to realize their own interests (B) a - dominant ideology that legitimates economic, political and cultural power (C) a - form of dual consciousness based on ideology and everyday experiences (D) a mode - of payment given for outstanding topiary - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - Hegemony refers to a dominant ideology that legitimates economic, policital, and - cultural power. The answer is (B). - - - Q: The shift from ''civil religion'' to ''common religion'' means that: - - (A) the increasing bureaucracy of the state has made religion only a marginal part - of our lives (B) despite the weakening of traditional authority, our everyday lives - and ''common sense'' remain shaped by religious beliefs and values (C) religious - participation in collective worship may have declined, but people still practise - their faiths in private (D) people are much more likely to discuss their religious - beliefs in public, informal settings - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - The shift from civil religion to common religion means that despite the weakening - of traditional authority, our everyday lives and common sense remain shaped by religious - beliefs and values. The answer is (B). - - - Q: Which of the following did the post-war welfare state of 1948 not aim to provide: - - (A) free health care and education for all (B) a minimum wage (C) full employment - (D) universal welfare - - A: Let''s think step by step. We refer to Wikipedia articles on sociology for help. - The post-war welfare state of 1948 aimed to provide free healthcare and education, - full employment, and universal welfare. But it did not aim to provide a minimum - wage. The answer is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_sociology +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \nQ: Which of the following is not a problem associated with official statistics\ + \ on strike action?\n(A) most strikes go unnoticed by employers and the mass media\ + \ (B) not all industrial disputes will be reported by the employer (C) the definition\ + \ of strikes excludes those that involve fewer than ten workers or last less than\ + \ one day (D) it is hard to compare strikes that were measured in different ways\n\ + A: Let's think step by step. We refer to Wikipedia articles on sociology for help.\ + \ Official statistics on strike action can be problematic because not all industrial\ + \ disputes will be reported by employers, the definition of strikes excludes those\ + \ that involves fewer than ten workers or last less than one day, and it is hard\ + \ to compare strikes that were measured in different ways. Thus, (A) is not a problem\ + \ associated with official statistics on strike action. The answer is (A).\n\nQ:\ + \ What does Berger (1963) describe as a metaphor for social reality?\n(A) a fairground\ + \ ride (B) a circus (C) a puppet theatre (D) a ballet\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on sociology for help. Berger describes social\ + \ reality using the metaphor of a puppet theatre. 
The answer is (C).\n\nQ: The term\ + \ 'hegemony' refers to:\n(A) the tendency for the working class not to realize their\ + \ own interests (B) a dominant ideology that legitimates economic, political and\ + \ cultural power (C) a form of dual consciousness based on ideology and everyday\ + \ experiences (D) a mode of payment given for outstanding topiary\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on sociology for help. Hegemony refers\ + \ to a dominant ideology that legitimates economic, policital, and cultural power.\ + \ The answer is (B).\n\nQ: The shift from 'civil religion' to 'common religion'\ + \ means that:\n(A) the increasing bureaucracy of the state has made religion only\ + \ a marginal part of our lives (B) despite the weakening of traditional authority,\ + \ our everyday lives and 'common sense' remain shaped by religious beliefs and values\ + \ (C) religious participation in collective worship may have declined, but people\ + \ still practise their faiths in private (D) people are much more likely to discuss\ + \ their religious beliefs in public, informal settings\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on sociology for help. The shift from civil religion\ + \ to common religion means that despite the weakening of traditional authority,\ + \ our everyday lives and common sense remain shaped by religious beliefs and values.\ + \ The answer is (B).\n\nQ: Which of the following did the post-war welfare state\ + \ of 1948 not aim to provide:\n(A) free health care and education for all (B) a\ + \ minimum wage (C) full employment (D) universal welfare\nA: Let's think step by\ + \ step. We refer to Wikipedia articles on sociology for help. The post-war welfare\ + \ state of 1948 aimed to provide free healthcare and education, full employment,\ + \ and universal welfare. But it did not aim to provide a minimum wage. The answer\ + \ is (B)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_sociology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml index 6340aee39df19cace3b1f3c581577491681d173b..fb9967306a1da5db4fcfa0517059631bcfcbeadb 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml @@ -1,66 +1,40 @@ -dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us - foreign policy. - - - Q: How did Donald Trump attack globalization in the 2016 campaign? - - (A) Globalization had made men like him too rich (B) Globalization only benefited - certain American states, such as New York (C) Liberal elites had encouraged globalization, - while ''ordinary Americans'' lost jobs because of it (D) Globalization encouraged - damaging trade wars - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. Trump attacked globalization because he believed ordinary Americans lost - jobs due to it, and so he wanted to blame liberals who had encouraged it. The answer - is (C). - - - Q: How did NSC-68 change U.S. strategy? - - (A) It globalized containment. (B) It militarized containment. (C) It called for - the development of the hydrogen bomb. (D) All of the above - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. 
NSC-68 outlined a variety of courses of action, including globalization - of containment, militarization of contaiment, and the development of the hydrogen - bomb. The answer is (D). - - - Q: How do Defensive Realism and Offensive Realism differ in their explanation of - state behaviour? - - (A) Defensive realists place greater emphasis on the role of international institutions - (B) Defensive realists place less emphasis on geographical factors (C) Offensive - realists give more priority to the national interest than Defensive realists. (D) - Defensive realists believe states are security maximizers, while Offensive realists - believe states to be power maximizers - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. While defensive realism advocates that states are security maximizers, - offensive realists think of states as power maximizers. The answer is (D). - - - Q: The realm of policy decisions concerned primarily with relations between the - United States and the rest of the world is known as - - (A) terrorism policy. (B) economic policy. (C) foreign policy. (D) international - policy. - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. The topic of policy decisions concerns with relations between the US and - the rest of the world is known as foreign policy. The answer is (C). - - - Q: How did the 2008 financial crisis affect America''s international reputation? - - (A) It damaged support for the US model of political economy and capitalism (B) - It created anger at the United States for exaggerating the crisis (C) It increased - support for American global leadership under President Obama (D) It reduced global - use of the US dollar - - A: Let''s think step by step. We refer to Wikipedia articles on us foreign policy - for help. The 2008 financial crisis damanged the international reputation of the - American model of political economy and capitalism. The answer is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_us_foreign_policy +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\nQ: How did Donald Trump attack globalization in the 2016 campaign?\n\ + (A) Globalization had made men like him too rich (B) Globalization only benefited\ + \ certain American states, such as New York (C) Liberal elites had encouraged globalization,\ + \ while 'ordinary Americans' lost jobs because of it (D) Globalization encouraged\ + \ damaging trade wars\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on us foreign policy for help. Trump attacked globalization because he believed\ + \ ordinary Americans lost jobs due to it, and so he wanted to blame liberals who\ + \ had encouraged it. The answer is (C).\n\nQ: How did NSC-68 change U.S. strategy?\n\ + (A) It globalized containment. (B) It militarized containment. (C) It called for\ + \ the development of the hydrogen bomb. (D) All of the above\nA: Let's think step\ + \ by step. We refer to Wikipedia articles on us foreign policy for help. NSC-68\ + \ outlined a variety of courses of action, including globalization of containment,\ + \ militarization of contaiment, and the development of the hydrogen bomb. 
The answer\ + \ is (D).\n\nQ: How do Defensive Realism and Offensive Realism differ in their explanation\ + \ of state behaviour?\n(A) Defensive realists place greater emphasis on the role\ + \ of international institutions (B) Defensive realists place less emphasis on geographical\ + \ factors (C) Offensive realists give more priority to the national interest than\ + \ Defensive realists. (D) Defensive realists believe states are security maximizers,\ + \ while Offensive realists believe states to be power maximizers\nA: Let's think\ + \ step by step. We refer to Wikipedia articles on us foreign policy for help. While\ + \ defensive realism advocates that states are security maximizers, offensive realists\ + \ think of states as power maximizers. The answer is (D).\n\nQ: The realm of policy\ + \ decisions concerned primarily with relations between the United States and the\ + \ rest of the world is known as\n(A) terrorism policy. (B) economic policy. (C)\ + \ foreign policy. (D) international policy.\nA: Let's think step by step. We refer\ + \ to Wikipedia articles on us foreign policy for help. The topic of policy decisions\ + \ concerns with relations between the US and the rest of the world is known as foreign\ + \ policy. The answer is (C).\n\nQ: How did the 2008 financial crisis affect America's\ + \ international reputation?\n(A) It damaged support for the US model of political\ + \ economy and capitalism (B) It created anger at the United States for exaggerating\ + \ the crisis (C) It increased support for American global leadership under President\ + \ Obama (D) It reduced global use of the US dollar\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on us foreign policy for help. The 2008 financial\ + \ crisis damanged the international reputation of the American model of political\ + \ economy and capitalism. The answer is (A)." +"group": "mmlu_flan_cot_fewshot_social_sciences" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml index 5bbd7a2ce944e63cfdf5d609d6fc5bf822a685b3..71f8f8bf7a83cecb3110f1b12bbfcd6cf2501317 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml @@ -1,55 +1,31 @@ -dataset_name: virology -description: 'The following are multiple choice questions (with answers) about virology. - - - Q: The median survival time to AIDS and death was established by following: - - (A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters - (D) High-risk seronegatives - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - The median survival time to AIDS and death was established as a result of the development - of seroconverters. The answer is (C). - - - Q: Which of the following is a morphological characteristic of the paramyxoviruses. - - (A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate - viruses (C) Icosahedral viruses with envelope (D) Very large viruses - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - Paramyxoviruses are fragile viruses often visualised with RNA spewing from the inside. - The answer is (A). 
- - - Q: The most important goal of a behavioral intervention is: - - (A) Change in behavior (B) Comprehensive coverage (C) Effective use of behavioral - theory (D) Sustained behavior change - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - The prim goal of a behavioral intervention is to cause sustained behavior change. - The answer is (D). - - - Q: A key factor facilitating the application of nested case-control studies from - the MACS was: - - (A) Data collection (B) Establishment of a repository of biologic specimens (C) - Participant interest (D) Administration of the questionnaire by staff - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - The Multicenter AIDS Cohort Study''s use of nested case-control studies was facilitated - by the establishment of a repository of biologic specimens. The answer is (B). - - - Q: Why are parvoviruses a highly impactful parasite? - - (A) Because they have no nucleic acid (B) They require a helper virus (C) Only replicate - in dividing cells (D) Can integrate into host chromosomes - - A: Let''s think step by step. We refer to Wikipedia articles on virology for help. - Paroviruses are highly impactful because they do not have nucleic acid. The answer - is (A).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_virology +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \nQ: The median survival time to AIDS and death was established by following:\n\ + (A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters\ + \ (D) High-risk seronegatives\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on virology for help. The median survival time to AIDS and death was\ + \ established as a result of the development of seroconverters. The answer is (C).\n\ + \nQ: Which of the following is a morphological characteristic of the paramyxoviruses.\n\ + (A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate\ + \ viruses (C) Icosahedral viruses with envelope (D) Very large viruses\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on virology for help. Paramyxoviruses\ + \ are fragile viruses often visualised with RNA spewing from the inside. The answer\ + \ is (A).\n\nQ: The most important goal of a behavioral intervention is:\n(A) Change\ + \ in behavior (B) Comprehensive coverage (C) Effective use of behavioral theory\ + \ (D) Sustained behavior change\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on virology for help. The prim goal of a behavioral intervention is to\ + \ cause sustained behavior change. The answer is (D).\n\nQ: A key factor facilitating\ + \ the application of nested case-control studies from the MACS was:\n(A) Data collection\ + \ (B) Establishment of a repository of biologic specimens (C) Participant interest\ + \ (D) Administration of the questionnaire by staff\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on virology for help. The Multicenter AIDS Cohort\ + \ Study's use of nested case-control studies was facilitated by the establishment\ + \ of a repository of biologic specimens. The answer is (B).\n\nQ: Why are parvoviruses\ + \ a highly impactful parasite?\n(A) Because they have no nucleic acid (B) They require\ + \ a helper virus (C) Only replicate in dividing cells (D) Can integrate into host\ + \ chromosomes\nA: Let's think step by step. 
We refer to Wikipedia articles on virology\ + \ for help. Paroviruses are highly impactful because they do not have nucleic acid.\ + \ The answer is (A)." +"group": "mmlu_flan_cot_fewshot_other" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_virology" diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml index c01adcdb54619628eeef3417e35d502eb20622bf..133903227302569b1c1b954c054d5a0364c62fb5 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml @@ -1,53 +1,27 @@ -dataset_name: world_religions -description: 'The following are multiple choice questions (with answers) about world - religions. - - - Q: How can the Upanishads be characterized? - - (A) Ritual texts (B) Philosophical texts (C) Hymns (D) Origin stories - - A: Let''s think step by step. We refer to Wikipedia articles on world religions - for help. The Upanishads are the most recent part of Vedas (the oldest scriptures - in Hinduism) and supplied the basis of later Hindu philosophy. So they are philosophical - texts. The answer is (B). - - - Q: What is the Second Gem in Buddhism? - - (A) The Dharma (B) The Sangha (C) The Buddha (D) The Bodhisattva - - A: Let''s think step by step. We refer to Wikipedia articles on world religions - for help. The Second Gem in Buddhism is The Dharma. The answer is (A). - - - Q: Which Japanese government promoted a kind of national cult based on the emperor - and his associations with kami? - - (A) Honen (B) Tanaka (C) Tokugawa (D) Meiji - - A: Let''s think step by step. We refer to Wikipedia articles on world religions - for help. The promotion of a national cult based on the emperor and his associations - with Kami happened during the reign of Emperor Meiji (1852-1912). The answer is - (D). - - - Q: In which dynasty was the "Mandate of Heaven" developed to legitimatize the new - rulers? - - (A) Shang (B) Zhou (C) Han (D) Xia - - A: Let''s think step by step. We refer to Wikipedia articles on world religions - for help. The "Mandate of Heaven" was developed as an ancient Chinese philosophical - concept during the Zhou Dynasty (1046-256 BCE). The answer is (B). - - - Q: What is the sign of the covenant for Jewish males? - - (A) The rainbow (B) Circumcision (C) A son (D) Bar mitzvah - - A: Let''s think step by step. We refer to Wikipedia articles on world religions - for help. In Judaism, the most distinctive sign of the covenant is circumcision - (brit milah). The answer is (B).' -include: _mmlu_flan_cot_fewshot_template_yaml -task: mmlu_flan_cot_fewshot_world_religions +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\nQ: How can the Upanishads be characterized?\n(A) Ritual texts (B)\ + \ Philosophical texts (C) Hymns (D) Origin stories\nA: Let's think step by step.\ + \ We refer to Wikipedia articles on world religions for help. The Upanishads are\ + \ the most recent part of Vedas (the oldest scriptures in Hinduism) and supplied\ + \ the basis of later Hindu philosophy. So they are philosophical texts. The answer\ + \ is (B).\n\nQ: What is the Second Gem in Buddhism?\n(A) The Dharma (B) The Sangha\ + \ (C) The Buddha (D) The Bodhisattva\nA: Let's think step by step. We refer to Wikipedia\ + \ articles on world religions for help. 
The Second Gem in Buddhism is The Dharma.\ + \ The answer is (A).\n\nQ: Which Japanese government promoted a kind of national\ + \ cult based on the emperor and his associations with kami?\n(A) Honen (B) Tanaka\ + \ (C) Tokugawa (D) Meiji\nA: Let's think step by step. We refer to Wikipedia articles\ + \ on world religions for help. The promotion of a national cult based on the emperor\ + \ and his associations with Kami happened during the reign of Emperor Meiji (1852-1912).\ + \ The answer is (D).\n\nQ: In which dynasty was the \"Mandate of Heaven\" developed\ + \ to legitimatize the new rulers?\n(A) Shang (B) Zhou (C) Han (D) Xia\nA: Let's\ + \ think step by step. We refer to Wikipedia articles on world religions for help.\ + \ The \"Mandate of Heaven\" was developed as an ancient Chinese philosophical concept\ + \ during the Zhou Dynasty (1046-256 BCE). The answer is (B).\n\nQ: What is the sign\ + \ of the covenant for Jewish males?\n(A) The rainbow (B) Circumcision (C) A son\ + \ (D) Bar mitzvah\nA: Let's think step by step. We refer to Wikipedia articles on\ + \ world religions for help. In Judaism, the most distinctive sign of the covenant\ + \ is circumcision (brit milah). The answer is (B)." +"group": "mmlu_flan_cot_fewshot_humanities" +"include": "_mmlu_flan_cot_fewshot_template_yaml" +"task": "mmlu_flan_cot_fewshot_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..390425c7d248a99e385c73f19d2cfa9e6d27747c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml @@ -0,0 +1,6 @@ +group: mmlu_flan_cot_zeroshot +task: + - mmlu_flan_cot_zeroshot_stem + - mmlu_flan_cot_zeroshot_other + - mmlu_flan_cot_zeroshot_social_sciences + - mmlu_flan_cot_zeroshot_humanities diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e2762049b59fcac999ef4c3517e68bb0b68e372 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml @@ -0,0 +1,24 @@ +group: mmlu_flan_cot_zeroshot +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split +validation_split: validation +fewshot_split: dev +output_type: generate_until +doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." 
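+# Editorial sketch (not part of the committed file): for a hypothetical record
+# with question "What is the capital of France?" and choices
+# ["Berlin", "Paris", "Rome", "Madrid"], the Jinja template above renders:
+#   Q: What is the capital of France?
+#   (A) Berlin (B) Paris (C) Rome (D) Madrid
+#   A: Let's think step by step.
+# doc_to_target (next line) maps the integer `answer` column to the lettered
+# option against which the model's extracted answer is compared.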
+doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" +generation_kwargs: + until: + - "" + do_sample: false + temperature: 0.0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml deleted file mode 100644 index 0666018ba07b4c788a1d687040ec8eab3d612c48..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml +++ /dev/null @@ -1,24 +0,0 @@ -group: mmlu_flan_cot_zeroshot -dataset_path: cais/mmlu -validation_split: validation -fewshot_split: dev -output_type: greedy_until -doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." -doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" -filter_list: - - name: "get-answer" - filter: - - function: "regex" - regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" - - function: "take_first" -generation_kwargs: - until: - - "" - do_sample: false - temperature: 0.0 -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml index 17bccf1f25acf641a7e7a000a03e47b38822e649..8609f626a25d5f37d41ab8a312dfe226e44dbbd8 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml @@ -1,8 +1,6 @@ -dataset_name: abstract_algebra -description: 'The following are multiple choice questions (with answers) about abstract - algebra. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_abstract_algebra +"dataset_name": "abstract_algebra" +"description": "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml index 6e14fbc61fe0a8efb90fef52f69c4df1a0965324..2923349d0fb02f219844b192fb38880537cb9c3a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml @@ -1,7 +1,6 @@ -dataset_name: anatomy -description: 'The following are multiple choice questions (with answers) about anatomy. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_anatomy +"dataset_name": "anatomy" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml index b1ca9f525f59dae75386cab49ac215da3d56922e..e5ffd8ffe302442af98c246e9d7bac54c063d81f 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml @@ -1,7 +1,6 @@ -dataset_name: astronomy -description: 'The following are multiple choice questions (with answers) about astronomy. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_astronomy +"dataset_name": "astronomy" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml index 53f3a78ff036ef418078ef55ed53f745bb8e7d18..a64285711f2f23775ddc37431b5c39f5a589f9ec 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml @@ -1,8 +1,6 @@ -dataset_name: business_ethics -description: 'The following are multiple choice questions (with answers) about business - ethics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_business_ethics +"dataset_name": "business_ethics" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml index f858d6718e9fb991d70144862820431d81ae42d8..e3655230cf9703b3d02bfa457f667d02676a3aa7 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml @@ -1,8 +1,6 @@ -dataset_name: clinical_knowledge -description: 'The following are multiple choice questions (with answers) about clinical - knowledge. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_clinical_knowledge +"dataset_name": "clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml index 93471b6a2983d75c6884722feb8a7967551fc105..736bb6decd83da6ac642049a9bcf3c9af0f562f2 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml @@ -1,8 +1,6 @@ -dataset_name: college_biology -description: 'The following are multiple choice questions (with answers) about college - biology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_biology +"dataset_name": "college_biology" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml index 5f619baa421302f1d451fab64a8882ee6110a326..7b7199664bb086da78f2314bd69f69177c6116a4 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml @@ -1,8 +1,6 @@ -dataset_name: college_chemistry -description: 'The following are multiple choice questions (with answers) about college - chemistry. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_chemistry +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml index 865b91bf3d67f20b7f8986b00b7edd2d529daec1..185f2a6614dad536c1a16d1b1017599a18427a67 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml @@ -1,8 +1,6 @@ -dataset_name: college_computer_science -description: 'The following are multiple choice questions (with answers) about college - computer science. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_computer_science +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml index 1f8a89facebb654438b118a123a8290355bf16ff..210eb127c831b310084a9d2df11f1dc29cb62d87 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml @@ -1,8 +1,6 @@ -dataset_name: college_mathematics -description: 'The following are multiple choice questions (with answers) about college - mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_mathematics +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml index e852c64b55bfcb9db25d32904f08bc46914ccfeb..51c8a3c09ef78f6d7d8249c4f244a5c63c249cd5 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml @@ -1,8 +1,6 @@ -dataset_name: college_medicine -description: 'The following are multiple choice questions (with answers) about college - medicine. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_medicine +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml index f215c2f04ccb36a0794b92ead3a957c9abeffbeb..319c72142454df340260c1a6dd3f59f244a57eab 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml @@ -1,8 +1,6 @@ -dataset_name: college_physics -description: 'The following are multiple choice questions (with answers) about college - physics. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_college_physics +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml index 402f7bdc3b0c6c0a07fa12b4451c4ef5268e10ab..ae4bda965ef4dd839b400959b391a71f1fcddcd3 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml @@ -1,8 +1,6 @@ -dataset_name: computer_security -description: 'The following are multiple choice questions (with answers) about computer - security. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_computer_security +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml index c3ad637637ff55d906c1c276d33f7d272b037e82..2e1e43dbad9432de41c580779108843761280313 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml @@ -1,8 +1,6 @@ -dataset_name: conceptual_physics -description: 'The following are multiple choice questions (with answers) about conceptual - physics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_conceptual_physics +"dataset_name": "conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml index dad5a83b2e157d92dc33e85a7022049b2118d816..9ff25bba4657133ff33a491c641589aed6476114 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml @@ -1,7 +1,6 @@ -dataset_name: econometrics -description: 'The following are multiple choice questions (with answers) about econometrics. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_econometrics +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml index 72a08dca908dfc63fec57aa9266bf17a196f9029..ca10a43e910d6fe090af53ffaf90e645e1ad69a1 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml @@ -1,8 +1,6 @@ -dataset_name: electrical_engineering -description: 'The following are multiple choice questions (with answers) about electrical - engineering. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_electrical_engineering +"dataset_name": "electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml index 0531f23e7f3bd51a61012cf2503e12144e12476e..065c92d200e4df5a303cc1f4269ada803524cd89 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml @@ -1,8 +1,6 @@ -dataset_name: elementary_mathematics -description: 'The following are multiple choice questions (with answers) about elementary - mathematics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_elementary_mathematics +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml index 80b264010955ffa254237b19fb0b52c9837befbf..ec2d323cae468b5efd9739929a0822dfb853e233 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml @@ -1,8 +1,6 @@ -dataset_name: formal_logic -description: 'The following are multiple choice questions (with answers) about formal - logic. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_formal_logic +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml index 491d0db4472b8b37466a3cab8b5eba388632167b..b1e29a3e1379f3e09c72a4c25e92855c8ba42bd3 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml @@ -1,8 +1,6 @@ -dataset_name: global_facts -description: 'The following are multiple choice questions (with answers) about global - facts. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_global_facts +"dataset_name": "global_facts" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml index 32da2e26720263347a474f5b048136dd7f47ae3f..0e5794db64588edef17d3c396f96ef870383cfa3 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_biology -description: 'The following are multiple choice questions (with answers) about high - school biology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_biology +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml index 5968e54ecc612744d3bbe1554c8336ee7e568efd..eba398b0393383621f3d688ea5356409eb56b215 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_chemistry -description: 'The following are multiple choice questions (with answers) about high - school chemistry. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_chemistry +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml index 2666de907dfdbd47ca86ac4ee6eecdac3e4f2e0c..4a69dbb3b9ca30313c8d333763a10796d1692bca 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_computer_science -description: 'The following are multiple choice questions (with answers) about high - school computer science. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_computer_science +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml index fb59ada48b5a5475a12abd748fecc3116ceaba27..54eafb51d385f7afd35a78d2ed8098565d1c5297 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_european_history -description: 'The following are multiple choice questions (with answers) about high - school european history. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_european_history +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml index ed3fca551b276d8bb3b9a7799766ed86aa189a0b..0898c87664e5250530d6998337c8fc601e1b876d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_geography -description: 'The following are multiple choice questions (with answers) about high - school geography. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_geography +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml index 62803b4ba76b23020d23c83f8ca09c6faa689dbf..d82fb6b01612fa74173989d6a7297d4f7521d3dd 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_government_and_politics -description: 'The following are multiple choice questions (with answers) about high - school government and politics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_government_and_politics +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml index f973b58db7e405def8f62027b4d319ef11b50a1f..b94fc2a6e356b8099d0b7a28f2ba07395d6b8599 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_macroeconomics -description: 'The following are multiple choice questions (with answers) about high - school macroeconomics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_macroeconomics +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml index 550dfcf19238b120f7d71cd12945964818fca183..dff0960afbb669c59b6159b45a5a474110c0d770 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_mathematics -description: 'The following are multiple choice questions (with answers) about high - school mathematics. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_mathematics +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml index 8a1e4c4cf52e00208ef942033166fef9af4d6808..75a08c48bd9b4c7b44df1b07064bf920b7c4f8af 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_microeconomics -description: 'The following are multiple choice questions (with answers) about high - school microeconomics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_microeconomics +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml index 4997e712e290073658b7aa7671f7319076ab87a9..177d42da511dde59f1baf00f1212834b483e3426 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_physics -description: 'The following are multiple choice questions (with answers) about high - school physics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_physics +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml index a3e801ca23d5bd4d9b92b5fb061ec9b754a7e986..d5d477233122391157ebe6ce3b817902c5a39712 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_psychology -description: 'The following are multiple choice questions (with answers) about high - school psychology. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_psychology +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml index d057cbef389f75fc1aaa3ca04e6b0a189009f7ed..b245cf9e51337a10a12ae8c9a5df6f92e0144ec0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_statistics -description: 'The following are multiple choice questions (with answers) about high - school statistics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_statistics +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml index 583d9591e7addaf2be3f2efcabc447b7bb054eb9..2e187da2980912702558bab3098b363f3e68efcd 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_us_history -description: 'The following are multiple choice questions (with answers) about high - school us history. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_us_history +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml index 40445582dd351ba40e51fb9e41b23d82a14bfc8d..c89dd0faa47730d507b7337abbf38e00879389b5 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml @@ -1,8 +1,6 @@ -dataset_name: high_school_world_history -description: 'The following are multiple choice questions (with answers) about high - school world history. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_high_school_world_history +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml index c6db4c1ce776ddf2efa23297c6a0c51bda0fbfdf..230781b4a5d4293f555f8fe1f6395d818de5eca9 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml @@ -1,8 +1,6 @@ -dataset_name: human_aging -description: 'The following are multiple choice questions (with answers) about human - aging. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_human_aging +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml index 41795660afa1958313673d56b68e5a2f4558ff02..ed2116dd24e9804e2ae4def9fd0cd9518618cf3a 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml @@ -1,8 +1,6 @@ -dataset_name: human_sexuality -description: 'The following are multiple choice questions (with answers) about human - sexuality. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_human_sexuality +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml index da1273b0a6f03cfca3baefa74f6087b006891449..d777e9fc81262b0ff745fb2f9c82376c669c15df 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml @@ -1,8 +1,6 @@ -dataset_name: international_law -description: 'The following are multiple choice questions (with answers) about international - law. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_international_law +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_international_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml index e1a6a28b64f818c361eda97d1d1b3ce784e2d16e..62b86dd01f37099fa29c1bdfbba5f1beb97fb509 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml @@ -1,7 +1,6 @@ -dataset_name: jurisprudence -description: 'The following are multiple choice questions (with answers) about jurisprudence. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_jurisprudence +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml index e94cde1792a710cd7f7bda8c6d9df0b79a3b182b..07ae843871ca58cdce30815b0088cd7f7027e0d3 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml @@ -1,8 +1,6 @@ -dataset_name: logical_fallacies -description: 'The following are multiple choice questions (with answers) about logical - fallacies. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_logical_fallacies +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml index a17387bd1da97242629ebf5d33aa6315efa29047..cd4813effe182c7e3fc8270d23fd1f3e7d882836 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml @@ -1,8 +1,6 @@ -dataset_name: machine_learning -description: 'The following are multiple choice questions (with answers) about machine - learning. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_machine_learning +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"group": "mmlu_flan_cot_zeroshot_stem" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml index 68fc6ba22039979c03eb27b0d67f9ad0892dd22d..b7164c1cfcb00b9359809173da72dad15383143c 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml @@ -1,7 +1,6 @@ -dataset_name: management -description: 'The following are multiple choice questions (with answers) about management. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_management +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_management" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml index f6c6444c3093e2ae396c20eb5688918636b8d3bd..0827f78df766331ccdf6235a4b4120a089cdd9c9 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml @@ -1,7 +1,6 @@ -dataset_name: marketing -description: 'The following are multiple choice questions (with answers) about marketing. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_marketing +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_marketing" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml index 2490826ba9c688224f99da37962bbb0fd077674a..1706ee5b4ac151a36e7a75b638d75e8194696f8d 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml @@ -1,8 +1,6 @@ -dataset_name: medical_genetics -description: 'The following are multiple choice questions (with answers) about medical - genetics. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_medical_genetics +"dataset_name": "medical_genetics" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml index 5aebaef852b2cb9f4bf2edbbdfda05bc10084c3e..295d801a470ac7671d6f75c0527b6d508a70a6f8 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml @@ -1,7 +1,6 @@ -dataset_name: miscellaneous -description: 'The following are multiple choice questions (with answers) about miscellaneous. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_miscellaneous +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml index 8582945445a8efad62f29abba56d4d44d1c3afd5..a4595f06991b1096e49325c897fbe6f0b3eea6c2 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml @@ -1,8 +1,6 @@ -dataset_name: moral_disputes -description: 'The following are multiple choice questions (with answers) about moral - disputes. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_moral_disputes +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml index f8a31ddcc380316164784f1a927755693ccdcbb8..a0e41ae4ea1346ac76c56bcae3682b86610fe04f 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml @@ -1,8 +1,6 @@ -dataset_name: moral_scenarios -description: 'The following are multiple choice questions (with answers) about moral - scenarios. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_moral_scenarios +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml index 238c3f1c1e34e2ec6132354566d4d82d80a95470..4c87be43317b9a9854208d7278bf51ceb922a2cd 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml @@ -1,7 +1,6 @@ -dataset_name: nutrition -description: 'The following are multiple choice questions (with answers) about nutrition. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_nutrition +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml index c4a8fb47243d7b4e413213179ba37f1c8c6514d1..534707cb2b08c1605f0bfeeabcbe8ab0bd372038 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml @@ -1,7 +1,6 @@ -dataset_name: philosophy -description: 'The following are multiple choice questions (with answers) about philosophy. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_philosophy +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml index 07f318133ccbc9886e20ae9e4d4915a025c86e74..3233ba4e3b7396b96ea2f3e4788d2e70b670abf4 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml @@ -1,7 +1,6 @@ -dataset_name: prehistory -description: 'The following are multiple choice questions (with answers) about prehistory. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_prehistory +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml index 82b5ff2c6dc83780ef310370e6e88b894f0df1e1..021090c6a0671c8a56dc60254590add42e94f917 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_accounting -description: 'The following are multiple choice questions (with answers) about professional - accounting. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_accounting +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml index 32210b491f5ea2bfd9f35e34d3e566eb5a68b83a..73d115d73c03e608117f7d3f6bf17efd688041ba 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_law -description: 'The following are multiple choice questions (with answers) about professional - law. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_law +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional\ + \ law.\n\n" +"group": "mmlu_flan_cot_zeroshot_humanities" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml index ed9eebe1bb1a579d6aaa3f2b0fc1ad3f8309adb6..47cf9573d08e1f911af7fbfbd984358fecd275d9 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_medicine -description: 'The following are multiple choice questions (with answers) about professional - medicine. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_medicine +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" +"group": "mmlu_flan_cot_zeroshot_other" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml index 7110b8405a67189bbeed72b0eb3cf3296e1cd1ba..cc055d5bacacf77b1dd5f70b68bbaa81d1aad2ff 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml @@ -1,8 +1,6 @@ -dataset_name: professional_psychology -description: 'The following are multiple choice questions (with answers) about professional - psychology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_professional_psychology +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml index 5138cdd8c4e9e69695cfd66edf02f12cb0d86c55..14d02c3a3c015e78cb780c646461fb7ac70a5ce4 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml @@ -1,8 +1,6 @@ -dataset_name: public_relations -description: 'The following are multiple choice questions (with answers) about public - relations. 
- - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_public_relations +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml index 84c359d7cc5ab275845299ff6607990d24bdb2a4..cae551e2adf6e28153c612f7059f527c116f3d10 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml @@ -1,8 +1,6 @@ -dataset_name: security_studies -description: 'The following are multiple choice questions (with answers) about security - studies. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_security_studies +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml index fed1dc496d8c35f165988fea1aa8d94bd5e9746e..45b94193c55ac43e7ee6dc33462e128748a68c21 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml @@ -1,7 +1,6 @@ -dataset_name: sociology -description: 'The following are multiple choice questions (with answers) about sociology. - - - ' -include: _mmlu_flan_generative_template_yaml -task: mmlu_flan_cot_zeroshot_sociology +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" +"group": "mmlu_flan_cot_zeroshot_social_sciences" +"include": "_mmlu_flan_cot_zeroshot_template_yaml" +"task": "mmlu_flan_cot_zeroshot_sociology" diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml index d94f60e9e5f4c1d1070544dbe579ae0e677f3887..52e482775f7f5873ccf19628a1e4bb85baf4b2b0 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml @@ -1,8 +1,6 @@ -dataset_name: us_foreign_policy -description: 'The following are multiple choice questions (with answers) about us - foreign policy. 
-
-
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_cot_zeroshot_us_foreign_policy
+"dataset_name": "us_foreign_policy"
+"description": "The following are multiple choice questions (with answers) about us\
+  \ foreign policy.\n\n"
+"group": "mmlu_flan_cot_zeroshot_social_sciences"
+"include": "_mmlu_flan_cot_zeroshot_template_yaml"
+"task": "mmlu_flan_cot_zeroshot_us_foreign_policy"
diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml
index feaa8b065ba02538367062af84d7e874cdef1c87..fda1af06360f4f753dee2f3ca6aa4f2720558965 100644
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml
@@ -1,7 +1,6 @@
-dataset_name: virology
-description: 'The following are multiple choice questions (with answers) about virology.
-
-
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_cot_zeroshot_virology
+"dataset_name": "virology"
+"description": "The following are multiple choice questions (with answers) about virology.\n\
+  \n"
+"group": "mmlu_flan_cot_zeroshot_other"
+"include": "_mmlu_flan_cot_zeroshot_template_yaml"
+"task": "mmlu_flan_cot_zeroshot_virology"
diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml
index fe2b4c42aeba3ab82b2593c5cd0bde744a866c85..4051828232c392e69408f8ad71d4bbbbc41a260f 100644
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml
@@ -1,8 +1,6 @@
-dataset_name: world_religions
-description: 'The following are multiple choice questions (with answers) about world
-  religions.
-
-
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_cot_zeroshot_world_religions
+"dataset_name": "world_religions"
+"description": "The following are multiple choice questions (with answers) about world\
+  \ religions.\n\n"
+"group": "mmlu_flan_cot_zeroshot_humanities"
+"include": "_mmlu_flan_cot_zeroshot_template_yaml"
+"task": "mmlu_flan_cot_zeroshot_world_religions"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
deleted file mode 100644
index b1ff96a8f3ef8d9744fb57923fbbc8dab7ed3605..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-group: mmlu_flan_n_shot_generative
-dataset_path: cais/mmlu
-test_split: test
-fewshot_split: dev
-output_type: greedy_until
-doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
-doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
-generation_kwargs:
-  until:
-    - "</s>"
-metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7705a171c32e73a31693cca354d0baeaf98b64b8
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
@@ -0,0 +1,6 @@
+group: mmlu_flan_n_shot_generative
+task:
+  - mmlu_flan_n_shot_generative_stem
+  - mmlu_flan_n_shot_generative_other
+  - mmlu_flan_n_shot_generative_social_sciences
+  - mmlu_flan_n_shot_generative_humanities
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
new file mode 100644
index 0000000000000000000000000000000000000000..93dc8c71839ffd34b53100eb4f0838752a014f79
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -0,0 +1,14 @@
+group: mmlu_flan_n_shot_generative
+dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
+test_split: test
+fewshot_split: dev
+output_type: generate_until
+doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
+doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+generation_kwargs:
+  until:
+    - "</s>"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..40cced2c583fd71fdf6f425b4637a04daad7b52b
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "abstract_algebra"
+"description": "The following are multiple choice questions (with answers) about abstract\
+  \ algebra.\n\n"
+"group": "mmlu_flan_n_shot_generative_stem"
+"include": "_mmlu_flan_generative_template_yaml"
+"task": "mmlu_flan_n_shot_generative_abstract_algebra"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..606049a506ac86fa8f3950216e05b34f1f1bc612
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "anatomy"
+"description": "The following are multiple choice questions (with answers) about anatomy.\n\
+  \n"
+"group": "mmlu_flan_n_shot_generative_stem"
+"include": "_mmlu_flan_generative_template_yaml"
+"task": "mmlu_flan_n_shot_generative_anatomy"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..db5faa222dc34b2db9b56333dda2c2326a394365
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "astronomy"
+"description": "The following are multiple choice questions (with answers) about astronomy.\n\
+  \n"
+"group": "mmlu_flan_n_shot_generative_stem"
+"include": "_mmlu_flan_generative_template_yaml"
+"task": "mmlu_flan_n_shot_generative_astronomy"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..add2ffb46b10712593c4c28b622f32d6db3bcaf8
--- /dev/null
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml
@@ -0,0 +1,6 @@
+"dataset_name": "business_ethics"
+"description": "The following are multiple choice questions (with answers) about business\
+  \ ethics.\n\n"
+"group": "mmlu_flan_n_shot_generative_other"
+"include": "_mmlu_flan_generative_template_yaml"
+"task": "mmlu_flan_n_shot_generative_business_ethics"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml
b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3f24569c2d1bf4a99f21417deec009d51cffdb1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml @@ -0,0 +1,6 @@ +"dataset_name": "clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3772b0e6411ec48f7ee0b68fe15ba7288cfde811 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_biology" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cedcf0cc2ffc855be657dc7fa2869b6c3a4e9c76 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a060903a97c31f701fb9a2158df072120d38e46a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer science.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1899ce6540bcacdc29922d8152cbc72da1a058da --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ab052dd69e6d0f10a35fbc21cf7289f142066a21 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b1e64e640ba8ab8625f02d3636b24770caac001 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd312a93a35d32ff5df23896d55106f80315ca96 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml @@ -0,0 +1,6 @@ +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..49e6b38c5b82a0afa099fcca02941e743bd3ce60 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d9b4ebfcdd5d23fc606e50bce368c0b4db545428 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6efb0ef202bc6a5df3d041f02a0abaa10a6c957 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml @@ -0,0 +1,6 @@ +"dataset_name": 
"electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b33cf3188391c328495536e671c58924578d4049 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c2ad3a1b6fc4f5f407e13645b77cb4da61d403d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml @@ -0,0 +1,6 @@ +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2352ff7ecb45b1724b7588582fc65db9a4f1e75 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml @@ -0,0 +1,6 @@ +"dataset_name": "global_facts" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b51f34a90b4dff6465b37d39d5f5dab7b2866c6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0066ba7716c6316d911d273cdffd0a21fb019bdf --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" 
+"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0d4ef155befae06523140e408ebbe6d2e88f4b3 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b17db63997a395704060e1390180526a44b1ecc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93f8de208581eef0c20dd612c3d5925fa9963dbf --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ae12c1744008b00d8b786a5075295b2d4ce8f81 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71d822595f4699348f1aca2d47cb0bd9529d5e6f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_macroeconomics" +"description": "The 
following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20d31e12cce9944cc53e914301dcb44984b7674a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c6d6ef9f6ab6faf79e324652ea22a7bce2f84e7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b016778ac9333281b7e061b1a8413515df814a6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1abf244c102484600de19995725959641b73b383 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df3e8d9300002f309b7cdc792e5583d3c2769da7 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_statistics" +"description": 
"The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68e3f0a97e14bdad7ae7456e28db627b8495797b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfb839c314246b34607c3a68038ce71a64b1ff96 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a857698f7770454d7ad9e7bd402a5a2f2a921af6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml @@ -0,0 +1,6 @@ +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3dde3af5a4f718cfbbc1d76e963fd568f3084279 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml @@ -0,0 +1,6 @@ +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be9018cfdb5023350aa78a30c576c42feb0627a9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml @@ -0,0 +1,6 @@ +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"group": 
"mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6b3c7aeabe7cec4b984d966971d03872d04cef2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml @@ -0,0 +1,6 @@ +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6363390211d1ca822c3299b2bbd2e7fcb36fff4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml @@ -0,0 +1,6 @@ +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64496cfb07007054b60a746a1ae3196f2d53abfc --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml @@ -0,0 +1,6 @@ +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"group": "mmlu_flan_n_shot_generative_stem" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63292cc18cc62dbb1dfe4c6f16b051c23447f583 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml @@ -0,0 +1,6 @@ +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0716dc144e3d01f2991c7126a2d47c53deb52bc9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml @@ -0,0 +1,6 @@ +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_marketing" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..92115979b24e64d2314808276bd2772247570dd6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "medical_genetics" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74e889444a157feadfb0a9f183639452e8caaf74 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml @@ -0,0 +1,6 @@ +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58bf43dfa819e83af629cc17b33ac6b49a5404dd --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml @@ -0,0 +1,6 @@ +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9630e517b03d2695214f95c5c33501b4a48ba6d5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml @@ -0,0 +1,6 @@ +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df14da9daf3bae25c95f685d06eba25fe6e740c5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml @@ -0,0 +1,6 @@ +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20f5d60bd040853a7de055ef9804238944b16807 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with 
answers) about philosophy.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3695e77055f40403a4aa384ddb20084bb5302359 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml @@ -0,0 +1,6 @@ +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..222642ac43c9c896249e7975a3a79dc70f689c78 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4d39e4934d39ce0b926808cfaf8ffcff3694c1a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional\ + \ law.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c420d0d3607a30e7edf13bfa2afb36dd3bce364b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5ba7495324a3194032983f9bf7430abfb429b69 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" 
+"task": "mmlu_flan_n_shot_generative_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9aa7d686f0a0d918838b59c7b4408374d1bce1c4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml @@ -0,0 +1,6 @@ +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d2e0cdaba7b02415346ac8ff0e0dee4626c6e0a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml @@ -0,0 +1,6 @@ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c42d0b9bfef8c4f7c021712155249fa05a13200 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c51472532d22ee6706847d028cb711c92356431 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\n" +"group": "mmlu_flan_n_shot_generative_social_sciences" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb083b62b3b7d4889f0c22cf56ec43edc9732293 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \n" +"group": "mmlu_flan_n_shot_generative_other" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..0f2c199a09beb9287a6e4b6c7e671f6be522e06d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml @@ -0,0 +1,6 @@ +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\n" +"group": "mmlu_flan_n_shot_generative_humanities" +"include": "_mmlu_flan_generative_template_yaml" +"task": "mmlu_flan_n_shot_generative_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7705a171c32e73a31693cca354d0baeaf98b64b8 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml @@ -0,0 +1,7 @@ +# group of groups: running this group evaluates all four MMLU subject-category groups below +group: mmlu_flan_n_shot_loglikelihood +task: + - mmlu_flan_n_shot_loglikelihood_stem + - mmlu_flan_n_shot_loglikelihood_other + - mmlu_flan_n_shot_loglikelihood_social_sciences + - mmlu_flan_n_shot_loglikelihood_humanities diff --git a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml similarity index 83% rename from lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml rename to lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml index 5db2981a59f38fa89b37457a251b10fbebafb803..3efc2e425092cea1e2b0205aad65c2178c6ae2d2 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml @@ -1,5 +1,5 @@ group: mmlu_flan_n_shot_loglikelihood -dataset_path: cais/mmlu +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split test_split: test fewshot_split: dev output_type: multiple_choice diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ac6123b69a1fb1371683dc77280439ac776704b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml @@ -0,0 +1,6 @@ +"dataset_name": "abstract_algebra" +"description": "The following are multiple choice questions (with answers) about abstract\ + \ algebra.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2790a593fed4c625086a9e8a73d8a92bedc26bfa --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "anatomy" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..199e9560bb864cacf2196cc0895131fd3319cecf --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml @@ -0,0 +1,6 @@ 
+"dataset_name": "astronomy" +"description": "The following are multiple choice questions (with answers) about astronomy.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a346cd514d181a8b33ece9ee71a063431cd5bfe --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "business_ethics" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e27f0556d26e60a5dabaf9f93e786e00bdd704c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml @@ -0,0 +1,6 @@ +"dataset_name": "clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91a91c675c81f4a54ff21bf7866f4806d947cc80 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_biology" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d3ddf2713aa803af85a603a5c1d0b905eee57c4 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_chemistry" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a37e75aa2cafe955bf84339e260765ba4027539 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_computer_science" +"description": "The following are multiple choice questions (with answers) about college\ + \ computer 
science.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ef3d57871d4213664fd9259a253a484d2aa0c4f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_mathematics" +"description": "The following are multiple choice questions (with answers) about college\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bd3c63e65232ea08ad70c3ca4f0adcda11b876f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_medicine" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..174a4eeea3a2130c9dbc64dfe4f391efdbf4cbe6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_physics" +"description": "The following are multiple choice questions (with answers) about college\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5eed81a4c757fc14ee35ee8e922ca333e6513c5 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml @@ -0,0 +1,6 @@ +"dataset_name": "computer_security" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c165c498e03e0b961c3e4effd4266f2a02ef90bb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": 
"mmlu_flan_n_shot_loglikelihood_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94ca68fea65ab24858c6ecabd63ffe6c9089cd50 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f72ffcadf97d18b8dee7311733d1cc031ad5e52 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml @@ -0,0 +1,6 @@ +"dataset_name": "electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..091c7a90666ceb06ac2bc329651ccd784a39ea65 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64a3d11daed5862cdf55c212e8bf875d9ff579c1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml @@ -0,0 +1,6 @@ +"dataset_name": "formal_logic" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ec7cc2ce67b53da950e0f49f80a7cccdf16ecf1 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml @@ -0,0 +1,6 @@ +"dataset_name": "global_facts" +"description": "The following are multiple choice questions (with answers) about global\ + \ facts.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml 
b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b2e15a0c0bf9394524afffae8e163b03f80b9b6 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_biology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..549aea5f6c258073890a9d7e91f86eac81230e56 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdbcfe93c65facce96f9cbe3ac8e769ab49746cd --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..855db984a6a8997c1a40602ffa382dcce1291fd0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_european_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school european history.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6744db9ff6701e0c7cffa44cbf76b62958194c8a --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_geography" diff --git 
a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c51d372f69b9f5c0d5eb721ddcb82bded936c816 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_government_and_politics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school government and politics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0bf0220b0c84d6b70df5c3e0456e2523e84ee70 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..958ab60b44c6db715484a607b46e7655ec90b27d --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_mathematics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school mathematics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8eaf6059a8e3a962e797d1c5786de9124d8239c2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..208bf5b91a4ee7b92374857edb028f235a698326 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_physics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school physics.\n\n" +"group": 
"mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c11af0a6b620aae48aab90245fba765c395f20fb --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5babfe5c895b7bc175b5f4a902d3efab5300e6c --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10306c2e1469a370a940a6f005a5d49a5579a011 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db7c1c11209401a0e5b125a7a6967bbea8cf07df --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_world_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school world history.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3935d431c55badc14f5ca3c6417181e4c028739 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml @@ -0,0 +1,6 @@ +"dataset_name": "human_aging" +"description": "The following are multiple choice questions (with answers) about human\ + \ aging.\n\n" +"group": 
"mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4672103c529da686c1809b3717fd2469b280ae00 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml @@ -0,0 +1,6 @@ +"dataset_name": "human_sexuality" +"description": "The following are multiple choice questions (with answers) about human\ + \ sexuality.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be63a3c50d0b46fe7de6aa36d118301b31766d9f --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml @@ -0,0 +1,6 @@ +"dataset_name": "international_law" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e0a819114b15ecdf5808d9560ab5b5e42ade809 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml @@ -0,0 +1,6 @@ +"dataset_name": "jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c920895f35b0afd3743c4dec5720e9dec9d129b --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml @@ -0,0 +1,6 @@ +"dataset_name": "logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9aad4df269aeefdc830cb42184323a137a16165 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml @@ -0,0 +1,6 @@ +"dataset_name": "machine_learning" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_stem" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_machine_learning" diff --git 
a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4709c403b7f1f164e01ac0e01c751d227a824b77 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml @@ -0,0 +1,6 @@ +"dataset_name": "management" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..808f1c787872372ffbaef2212ba3f4e4e6f983ac --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml @@ -0,0 +1,6 @@ +"dataset_name": "marketing" +"description": "The following are multiple choice questions (with answers) about marketing.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_marketing" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c0a99f87db4b1b6383808db24272cc83755a319 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "medical_genetics" +"description": "The following are multiple choice questions (with answers) about medical\ + \ genetics.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c363f1bd5817a891cf88091cd7bc4840d300ed25 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml @@ -0,0 +1,6 @@ +"dataset_name": "miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d710816fb75d43cf0bdba8704694df0835dba493 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml @@ -0,0 +1,6 @@ +"dataset_name": "moral_disputes" +"description": "The following are multiple choice questions (with answers) about moral\ + \ disputes.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d26770c7fd634b4a4bd4cd9490678c5a87d2859 --- /dev/null 
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml @@ -0,0 +1,6 @@ +"dataset_name": "moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..677185b3283a56cebefdf82986ae86987d1576aa --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml @@ -0,0 +1,6 @@ +"dataset_name": "nutrition" +"description": "The following are multiple choice questions (with answers) about nutrition.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c4b6f22364bdbb86f20fd56ca03cd0ea6d06778 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64065a6f82eea6d4213bca2e25e4cd03a271f013 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml @@ -0,0 +1,6 @@ +"dataset_name": "prehistory" +"description": "The following are multiple choice questions (with answers) about prehistory.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4fb590f8d30029e29fb60d8407c4d852b49d3ae9 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..581b9da7e9b116f54db7ced83ac23a2d73895c12 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_law" +"description": "The following are multiple choice questions (with answers) about professional\ + 
\ law.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c49f9119e489daaec5a17812c9d01a45c552a276 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_medicine" +"description": "The following are multiple choice questions (with answers) about professional\ + \ medicine.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d6f441d85063622a676f3e8d5092937bcf8e6e2 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d330fc968e82391154bf24fb682565a82236a98 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml @@ -0,0 +1,6 @@ +"dataset_name": "public_relations" +"description": "The following are multiple choice questions (with answers) about public\ + \ relations.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bbe963f62700833b37abd709b781f1279968201 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml @@ -0,0 +1,6 @@ +"dataset_name": "security_studies" +"description": "The following are multiple choice questions (with answers) about security\ + \ studies.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cc86bccdc8a24627957e25f16665e0604a505fa --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "sociology" +"description": "The following are multiple choice questions (with answers) about sociology.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": 
"_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12ac4f368e043e930634b6e5f63839c1af6893a0 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "us_foreign_policy" +"description": "The following are multiple choice questions (with answers) about us\ + \ foreign policy.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_social_sciences" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e942396f96910e0d7a55212274d8f203afc0508 --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "virology" +"description": "The following are multiple choice questions (with answers) about virology.\n\ + \n" +"group": "mmlu_flan_n_shot_loglikelihood_other" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30f974214d242dd19fcb3d1cdb1f98d26d20512e --- /dev/null +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml @@ -0,0 +1,6 @@ +"dataset_name": "world_religions" +"description": "The following are multiple choice questions (with answers) about world\ + \ religions.\n\n" +"group": "mmlu_flan_n_shot_loglikelihood_humanities" +"include": "_mmlu_flan_loglikelihood_template_yaml" +"task": "mmlu_flan_n_shot_loglikelihood_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml deleted file mode 100644 index 49b9c425468926cbdf75bdc50ce6db3ecf53bbf5..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_abstract_algebra.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "abstract_algebra" -"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml deleted file mode 100644 index 0c8d7914653b24dd9dc2593900d4b24d83187050..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_anatomy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "anatomy" -"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml deleted file mode 100644 index c92a10276672b86b2e6afa1dd8524bb7542e328a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_astronomy.yaml +++ /dev/null @@ -1,4 
+0,0 @@ -"dataset_name": "astronomy" -"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml deleted file mode 100644 index 4b65902edfe684181934d920ce4ef63fdc35057c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_business_ethics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "business_ethics" -"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml deleted file mode 100644 index 295fb23455c64b1f1852f5d2622a0962d430ccb9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_clinical_knowledge.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "clinical_knowledge" -"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml deleted file mode 100644 index f945181b99678b3b5627644e04eb1a1f41abee9e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_biology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_biology" -"description": "The following are multiple choice questions (with answers) about college biology.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml deleted file mode 100644 index 1fdab27d8a5de8c68da8cd1de0c9f6e44c00d4c3..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_chemistry.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_chemistry" -"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml deleted file mode 100644 index 6b41a5bbcd859bb1d94602cdcfca9303e58e1364..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_computer_science.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_computer_science" -"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml deleted file mode 100644 index 29e80a5e88a245d78fdd70732f9edfa9c9b4bcfe..0000000000000000000000000000000000000000 --- 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_mathematics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_mathematics" -"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml deleted file mode 100644 index a5061541cd4f95654df167b2c17feae5fb67de26..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_medicine.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_medicine" -"description": "The following are multiple choice questions (with answers) about college medicine.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml deleted file mode 100644 index ec3262eeb38495a0b9b1bb2477a6f4c80c729d2a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_college_physics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_physics" -"description": "The following are multiple choice questions (with answers) about college physics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml deleted file mode 100644 index a9ade9c56ed977e0b59a399882cfd9fb4ae80dc1..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_computer_security.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "computer_security" -"description": "The following are multiple choice questions (with answers) about computer security.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml deleted file mode 100644 index 5a903a65ab215fd47d3246c1ceba847d4a83e6ba..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_conceptual_physics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "conceptual_physics" -"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml deleted file mode 100644 index 847c8ce610b4c2e4f41b85e61f7b7c903e1814c4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_econometrics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "econometrics" -"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml deleted file mode 100644 index 
038379e0c66de5b379c959d2b4b9fee7c54a03a9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_electrical_engineering.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "electrical_engineering" -"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml deleted file mode 100644 index 4fd779de1dd1076df1b3b02eaf9a3591f9888ba4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_elementary_mathematics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "elementary_mathematics" -"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml deleted file mode 100644 index bb528831479cb89e9f8441608d9fcc27abcb64bc..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_formal_logic.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "formal_logic" -"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml deleted file mode 100644 index 1145dcabfbc235a693fe1dcd76d19a1e8b645432..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_global_facts.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "global_facts" -"description": "The following are multiple choice questions (with answers) about global facts.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml deleted file mode 100644 index 574a0c582de70379258eb4675554ba738431a290..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_biology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_biology" -"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml deleted file mode 100644 index ef79ed736ad0faa5911e71db0bab6e19d5995dfe..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_chemistry.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_chemistry" -"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_chemistry" diff --git 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml deleted file mode 100644 index 9d9200a604ddd915db15ce9c634f7409f30d8577..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_computer_science.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_computer_science" -"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml deleted file mode 100644 index e4b52a9c871da34248e699ee9f5f68d346ba675e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_european_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_european_history" -"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml deleted file mode 100644 index 8403d20e1ab5181cb552223f51cb26b77f4e9d92..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_geography.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_geography" -"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml deleted file mode 100644 index 50ad386378a583e9597860c710696da227db4448..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_government_and_politics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_government_and_politics" -"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml deleted file mode 100644 index 18bfb8b199aa33fca808a3ec0dafb2e12bf6f03c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_macroeconomics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_macroeconomics" -"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml deleted file mode 100644 index 
1b04a06fbf43f57828c6a1e9820af81692d74fbf..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_mathematics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_mathematics" -"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml deleted file mode 100644 index 9588af5998572f8b0963dae7253b3cdb6743609a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_microeconomics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_microeconomics" -"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml deleted file mode 100644 index 4aa033c8313f7cba4cd7775d0c502439894db515..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_physics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_physics" -"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml deleted file mode 100644 index 168c0c1542e5f5c89762d192afd74c654a6dc4ef..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_psychology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_psychology" -"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml deleted file mode 100644 index ba195da97890dd1436ca36dcf1abbfeb95a1dece..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_statistics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_statistics" -"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml deleted file mode 100644 index 0605fbc4a0d65960bac8ce64d4572b6cbd9b99b1..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_us_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_us_history" -"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" -"include": 
"_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml deleted file mode 100644 index aa54d7589efa2726c5597cdcd82f35e0b5887253..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_high_school_world_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_world_history" -"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml deleted file mode 100644 index d47b7fef73f0837540dab4f471de103ee8a51b8e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_aging.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "human_aging" -"description": "The following are multiple choice questions (with answers) about human aging.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml deleted file mode 100644 index 9be15e54292d9391ec524be40408beb03bc82478..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_human_sexuality.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "human_sexuality" -"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml deleted file mode 100644 index b80c9d58c650d2686aa717a83fe9ae4ac4115f2c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_international_law.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "international_law" -"description": "The following are multiple choice questions (with answers) about international law.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml deleted file mode 100644 index 5e7a53957e9389567b2bbf043cc06b526ca4fb5e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_jurisprudence.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "jurisprudence" -"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml deleted file mode 100644 index fcb718a0cc760e38796b4ee9fec47de9a43e97a1..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_logical_fallacies.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "logical_fallacies" -"description": "The following are multiple choice questions (with 
answers) about logical fallacies.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml deleted file mode 100644 index d879b54cfb54f9a43f1d2b5b7f011f5dd1172240..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_machine_learning.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "machine_learning" -"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml deleted file mode 100644 index 887c71a3c0e9da6fa00ca31b492e3559acada8bb..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_management.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "management" -"description": "The following are multiple choice questions (with answers) about management.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml deleted file mode 100644 index bad500ca5fb5e84ce12e9ea9c1d7a5e2a682ede0..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_marketing.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "marketing" -"description": "The following are multiple choice questions (with answers) about marketing.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_marketing" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml deleted file mode 100644 index c4faff1229d8e5d3b3f6733a01cc609b5f48359c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_medical_genetics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "medical_genetics" -"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml deleted file mode 100644 index e9aac3404ee503d1903dbb77ee4b6dbb1ecdbdcb..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_miscellaneous.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "miscellaneous" -"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml deleted file mode 100644 index 41af33e04cc8d49057fe6e2f64aa6e19927d331e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_disputes.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "moral_disputes" -"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" -"include": 
"_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml deleted file mode 100644 index 1689c3d9b7258f19ffdf2131a17a8be0c35dbed4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_moral_scenarios.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "moral_scenarios" -"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml deleted file mode 100644 index 24be1a63aab0ff6a76fe8cb201b183de8fc84c1b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_nutrition.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "nutrition" -"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml deleted file mode 100644 index 01040729e4ef52358456ead5c8ec2ce51cdce53d..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_philosophy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "philosophy" -"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml deleted file mode 100644 index fc5a6fbee0cb3b42dade0a469d8776a9d07bb9d0..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_prehistory.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "prehistory" -"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml deleted file mode 100644 index cbdd2f0d68dcb7b6ccc422acf19e045d5564d1ca..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_accounting.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_accounting" -"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml deleted file mode 100644 index 42e46529a6cdc4036eed9fe107e95398f6e85333..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_law.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_law" -"description": "The following are multiple choice questions (with answers) about professional law.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": 
"mmlu_flan_n_shot_generative_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml deleted file mode 100644 index a64610e6081287bdd8dba99a3cf1ecd7b331637b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_medicine.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_medicine" -"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml deleted file mode 100644 index b0c574fe9b5076b901616193bae55ddcbf8634f9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_professional_psychology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_psychology" -"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml deleted file mode 100644 index ff1030fc10a805eabfbace58860679339a46bd21..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_public_relations.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "public_relations" -"description": "The following are multiple choice questions (with answers) about public relations.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml deleted file mode 100644 index 25555da4679017e0a373b7f0b4b2276521494b95..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_security_studies.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "security_studies" -"description": "The following are multiple choice questions (with answers) about security studies.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml deleted file mode 100644 index f8ac254c25cc4f0b71a0bf7728ffa13c7d673199..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_sociology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "sociology" -"description": "The following are multiple choice questions (with answers) about sociology.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml deleted file mode 100644 index af3917ac5eccc54b732a8e660dfb70ed1bb3f35c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_us_foreign_policy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "us_foreign_policy" -"description": "The following are multiple choice questions (with answers) about us foreign 
policy.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml deleted file mode 100644 index b8df2d59de2c9d27cfb9f24d7ef2a125adad7973..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_virology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "virology" -"description": "The following are multiple choice questions (with answers) about virology.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_virology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml deleted file mode 100644 index 496f66c5f956015de4ad0b3962860fe490a59b0e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_gen_world_religions.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "world_religions" -"description": "The following are multiple choice questions (with answers) about world religions.\n\n" -"include": "_mmlu_flan_generative_template_yaml" -"task": "mmlu_flan_n_shot_generative_world_religions" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml deleted file mode 100644 index 4ea918d6b14aa52280fbb52d3b78fd636a09a9d3..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_abstract_algebra.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "abstract_algebra" -"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_abstract_algebra" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml deleted file mode 100644 index 9205bd31f7ea745f4d3a0574569f92cba958750b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_anatomy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "anatomy" -"description": "The following are multiple choice questions (with answers) about anatomy.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_anatomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml deleted file mode 100644 index dcd41de79097f419f620fa243bfb10534bfcbb48..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_astronomy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "astronomy" -"description": "The following are multiple choice questions (with answers) about astronomy.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_astronomy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml deleted file mode 100644 index 2b57abf359375aa12639b4ebefe929224fcca5e1..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_business_ethics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "business_ethics" -"description": "The following are multiple choice questions (with answers) about business ethics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": 
"mmlu_flan_n_shot_loglikelihood_business_ethics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml deleted file mode 100644 index 5b5da42e60fec403bccde2d8033150f491820565..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_clinical_knowledge.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "clinical_knowledge" -"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_clinical_knowledge" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml deleted file mode 100644 index c8cc429deea11f3de35ac40bafdacbf890380836..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_biology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_biology" -"description": "The following are multiple choice questions (with answers) about college biology.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml deleted file mode 100644 index 8be3a04d2f3de417ecd3b842d4855388789301b5..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_chemistry.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_chemistry" -"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml deleted file mode 100644 index 506ee7609dc5016adcf7f360f9bcae9a1a7312fa..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_computer_science.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_computer_science" -"description": "The following are multiple choice questions (with answers) about college computer science.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml deleted file mode 100644 index a9fe18142cb473471ecd41dbaa445365e26e2e39..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_mathematics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_mathematics" -"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml deleted file mode 100644 index 6f5d767ac30c9051c4d1ab01bc6ac6b03d3f13f2..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_medicine.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_medicine" -"description": "The 
following are multiple choice questions (with answers) about college medicine.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml deleted file mode 100644 index c6c22a40f3aefe94f60876bf6d66001de780b39f..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_college_physics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "college_physics" -"description": "The following are multiple choice questions (with answers) about college physics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_college_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml deleted file mode 100644 index 96bccc156c1f1ee90dce96438c06358d7ccf8767..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_computer_security.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "computer_security" -"description": "The following are multiple choice questions (with answers) about computer security.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_computer_security" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml deleted file mode 100644 index 2fc15ed0e2d273d7e15eeed4bcc04946de4cc135..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_conceptual_physics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "conceptual_physics" -"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_conceptual_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml deleted file mode 100644 index 07dbf9215cbac529e7d505c1410ff3e71f9e96bd..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_econometrics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "econometrics" -"description": "The following are multiple choice questions (with answers) about econometrics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_econometrics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml deleted file mode 100644 index 94492b11767017d31c0e0e955c88aa0f73d7b736..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_electrical_engineering.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "electrical_engineering" -"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_electrical_engineering" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml deleted file mode 100644 index 2cc56ef8714fce85b8cb61531a1e0dd9392c044f..0000000000000000000000000000000000000000 --- 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_elementary_mathematics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "elementary_mathematics" -"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_elementary_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml deleted file mode 100644 index 17e2820515a7a7ce1dd7ecc709beec3eedb99158..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_formal_logic.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "formal_logic" -"description": "The following are multiple choice questions (with answers) about formal logic.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_formal_logic" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml deleted file mode 100644 index 2b3cb86363094daedcffdeecac5bebe7527b14f3..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_global_facts.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "global_facts" -"description": "The following are multiple choice questions (with answers) about global facts.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_global_facts" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml deleted file mode 100644 index ed3e70b2d47a532468b2e13e20e66e1f4a2f0eb5..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_biology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_biology" -"description": "The following are multiple choice questions (with answers) about high school biology.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_biology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml deleted file mode 100644 index 729d37fa3dce9bb7f99b4e85ec5d08c2da0b8907..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_chemistry.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_chemistry" -"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_chemistry" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml deleted file mode 100644 index 7003e94ca05caf9930fe643b7054f0a95de0a11b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_computer_science.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_computer_science" -"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_computer_science" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml 
b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml deleted file mode 100644 index 0ad96085402b4fd835f8daecbffee13d3c2097c8..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_european_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_european_history" -"description": "The following are multiple choice questions (with answers) about high school european history.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_european_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml deleted file mode 100644 index f26e8bc6437efa82f41dae66620a19b939e070f2..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_geography.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_geography" -"description": "The following are multiple choice questions (with answers) about high school geography.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_geography" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml deleted file mode 100644 index 523e278dfff5da493ed665ee7d38b3078474b7ec..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_government_and_politics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_government_and_politics" -"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml deleted file mode 100644 index 6b08a4fc6ac571fdbba05b6c3aea621a4ba53c38..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_macroeconomics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_macroeconomics" -"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml deleted file mode 100644 index 83244ebb46c166e4fcd0d69ee462376de2bb4e2c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_mathematics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_mathematics" -"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_mathematics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml deleted file mode 100644 index 982f3f0855bd11900f3fd949850ecb34cfe684eb..0000000000000000000000000000000000000000 --- 
a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_microeconomics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_microeconomics" -"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_microeconomics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml deleted file mode 100644 index 25c3236905024eee381cee65e4a00b92c7cb81e4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_physics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_physics" -"description": "The following are multiple choice questions (with answers) about high school physics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_physics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml deleted file mode 100644 index a6e431dbb8ef7b620c15b3d8f27c88baae841a11..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_psychology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_psychology" -"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml deleted file mode 100644 index fa9075f519bb29262ce9acd64046105200e98023..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_statistics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_statistics" -"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_statistics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml deleted file mode 100644 index 094f95d0f30ded092166a71618503630540dba60..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_us_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_us_history" -"description": "The following are multiple choice questions (with answers) about high school us history.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_high_school_us_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml deleted file mode 100644 index 6ffd6d08208000bffa10f8a750360bca21e3f9e4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_high_school_world_history.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "high_school_world_history" -"description": "The following are multiple choice questions (with answers) about high school world history.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": 
"mmlu_flan_n_shot_loglikelihood_high_school_world_history" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml deleted file mode 100644 index d70d5e85561e3eb7fe1ae731900f82ce9c14720b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_aging.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "human_aging" -"description": "The following are multiple choice questions (with answers) about human aging.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_human_aging" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml deleted file mode 100644 index 39751188e4a41210f6f910aafc697b262a266ffc..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_human_sexuality.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "human_sexuality" -"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_human_sexuality" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml deleted file mode 100644 index 03fab6efd0a9315bad3bf24da5f37dd941453d47..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_international_law.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "international_law" -"description": "The following are multiple choice questions (with answers) about international law.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_international_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml deleted file mode 100644 index bb6bfc6fa45808fd3a09e8624e8e3404e992a847..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_jurisprudence.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "jurisprudence" -"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_jurisprudence" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml deleted file mode 100644 index d57576cd1cd040e9fe676d379345075152a2e3dd..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_logical_fallacies.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "logical_fallacies" -"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_logical_fallacies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml deleted file mode 100644 index 2c58692287659e6a34ad7b0d7e4b101470a3c6b3..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_machine_learning.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "machine_learning" -"description": "The following are multiple choice questions (with answers) about machine learning.\n\n" -"include": 
"_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_machine_learning" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml deleted file mode 100644 index 66b14f7fde33a86cb08fbfe87ea4adb81915f362..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_management.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "management" -"description": "The following are multiple choice questions (with answers) about management.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_management" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml deleted file mode 100644 index aacee467a882704c73c9b6a7b151027c06f5e0fd..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_marketing.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "marketing" -"description": "The following are multiple choice questions (with answers) about marketing.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_marketing" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml deleted file mode 100644 index 72d607fb9edabdb4be45783e8f4137d7e530ba94..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_medical_genetics.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "medical_genetics" -"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_medical_genetics" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml deleted file mode 100644 index 14db1ba868ad53e8365030b94257870d09999248..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_miscellaneous.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "miscellaneous" -"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_miscellaneous" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml deleted file mode 100644 index 0beccf44f3633729a6a046d9b090e3eba15b433b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_disputes.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "moral_disputes" -"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_moral_disputes" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml deleted file mode 100644 index 4d884b63e4aac497bfb906dc6d8ba4c0cd1cc7ee..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_moral_scenarios.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "moral_scenarios" -"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" 
-"task": "mmlu_flan_n_shot_loglikelihood_moral_scenarios" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml deleted file mode 100644 index ba1fdf6156bf297423d47aed29d8ff6f046fe77e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_nutrition.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "nutrition" -"description": "The following are multiple choice questions (with answers) about nutrition.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_nutrition" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml deleted file mode 100644 index 21645e774c1ae797f4f434c1ee56c81c866d3788..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_philosophy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "philosophy" -"description": "The following are multiple choice questions (with answers) about philosophy.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_philosophy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml deleted file mode 100644 index 74d9f30ce1f48a9554dcdcc2bb7f92eb3c103d9c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_prehistory.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "prehistory" -"description": "The following are multiple choice questions (with answers) about prehistory.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_prehistory" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml deleted file mode 100644 index 9010995f1038422997df67b5d9d140276e108d07..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_accounting.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_accounting" -"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_accounting" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml deleted file mode 100644 index 15fdad65f87f97c5c362d4efdbc25115b75171a2..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_law.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_law" -"description": "The following are multiple choice questions (with answers) about professional law.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_law" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml deleted file mode 100644 index 1bcc6a9a37187e2521dfd9c8cd5ce4bcc663264e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_medicine.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_medicine" -"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n" -"include": 
"_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_medicine" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml deleted file mode 100644 index 9144805c344a1fce8a376acf9c960180bdb658e9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_professional_psychology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "professional_psychology" -"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_professional_psychology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml deleted file mode 100644 index 0b4adc049ac5350e10fc1b4b1c15a94ac4108a44..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_public_relations.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "public_relations" -"description": "The following are multiple choice questions (with answers) about public relations.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_public_relations" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml deleted file mode 100644 index 2f4178f09ee1316eae7f7ec430a78217025359b1..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_security_studies.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "security_studies" -"description": "The following are multiple choice questions (with answers) about security studies.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_security_studies" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml deleted file mode 100644 index c583cf24b15b36c8417f67823ebf686e3c0a179c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_sociology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "sociology" -"description": "The following are multiple choice questions (with answers) about sociology.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_sociology" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml deleted file mode 100644 index f41d3c27e0ad0b7c8b2e03a540e6899901dbfdff..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_us_foreign_policy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "us_foreign_policy" -"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n" -"include": "_mmlu_flan_loglikelihood_template_yaml" -"task": "mmlu_flan_n_shot_loglikelihood_us_foreign_policy" diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml deleted file mode 100644 index c2cafd9b3cc702fc55b8fe1c0ecc307d901f57d8..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_virology.yaml +++ /dev/null @@ -1,4 +0,0 @@ -"dataset_name": "virology" -"description": "The following are multiple choice questions (with 
answers) about virology.\n\n"
-"include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_loglikelihood_virology"
diff --git a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml
deleted file mode 100644
index b1d1de0fa35b65fff853ba8d4fcf53312bf94ded..0000000000000000000000000000000000000000
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_log_world_religions.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-"dataset_name": "world_religions"
-"description": "The following are multiple choice questions (with answers) about world religions.\n\n"
-"include": "_mmlu_flan_loglikelihood_template_yaml"
-"task": "mmlu_flan_n_shot_loglikelihood_world_religions"
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca199226a8c02aefbde35b2295f9c91045e06e00
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py
@@ -0,0 +1,29 @@
+import yaml
+import datasets
+
+from tqdm import tqdm
+
+
+def main() -> None:
+    dataset_path = "EleutherAI/advanced_ai_risk"
+    for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
+        file_name = f"{task}.yaml"
+        try:
+            # Mode "x" raises FileExistsError for configs that already
+            # exist, so hand-edited files are never overwritten.
+            with open(file_name, "x") as f:
+                f.write("# Generated by _generate_configs.py\n")
+                yaml.dump(
+                    {
+                        "include": "_template_yaml",
+                        "task": f"{dataset_path.split('/')[-1]}_{task}",
+                        "dataset_name": task,
+                    },
+                    f,
+                )
+        except FileExistsError:
+            pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9a2b36d63248d86611748e9f54a18bea12dab065
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
@@ -0,0 +1,12 @@
+group: advanced_ai_risk
+dataset_path: EleutherAI/advanced_ai_risk
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "Human: {{question}}\n\nAssistant:"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+should_decontaminate: true
+doc_to_decontamination_query: "{{question}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..51c21b3b88bf8c89e99f2c17e64cc73e87b793f7
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-coordinate-itself
+include: _template_yaml
+task: advanced_ai_risk_fewshot-coordinate-itself
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f9d3ad4f0fd3176ce6e7ab69ae1b130ca674357d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: fewshot-coordinate-other-ais
+include: _template_yaml
+task: 
advanced_ai_risk_fewshot-coordinate-other-ais diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e536f01aae13b33c6c5e9b045fad566675cea49e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-coordinate-other-versions +include: _template_yaml +task: advanced_ai_risk_fewshot-coordinate-other-versions diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de4566c05cacca2cc597961a69c445337f7a30b9 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-corrigible-less-HHH +include: _template_yaml +task: advanced_ai_risk_fewshot-corrigible-less-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48e46178afba745b50247970f8971762d47d8145 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-corrigible-more-HHH +include: _template_yaml +task: advanced_ai_risk_fewshot-corrigible-more-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bf2a51f6d1d572b00398c159d7f36cf13d8a648 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-corrigible-neutral-HHH +include: _template_yaml +task: advanced_ai_risk_fewshot-corrigible-neutral-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7035496b92a9e65439520636e3d474968f10e04a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-myopic-reward +include: _template_yaml +task: advanced_ai_risk_fewshot-myopic-reward diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a29143d979ef6c0c7d3f38806e1bab6d020f2a8 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-one-box-tendency +include: _template_yaml +task: advanced_ai_risk_fewshot-one-box-tendency diff --git 
a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml new file mode 100644 index 0000000000000000000000000000000000000000..654b8c353935a7bf05bab75c07b763770dc88fe3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-power-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_fewshot-power-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..761adb4c41df581f666167face36a92f444f50f3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-general-ai +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-general-ai diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7daee72b401e0a054e1c251530c5d17fd2170124 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-good-text-model +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-good-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90ed8c1e9daf5d7d381006e3dba0c811ce9cc663 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-text-model +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6720524c741c32e14096d9f1e1feab2a26c32f2e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-self-awareness-training-architecture +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-training-architecture diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c01d2f6b34830159da1b76efdd61ba5e1957fb25 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py 
+dataset_name: fewshot-self-awareness-training-web-gpt +include: _template_yaml +task: advanced_ai_risk_fewshot-self-awareness-training-web-gpt diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9aa78bd373010a03e2888eb458e590cbb23f700c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-survival-instinct +include: _template_yaml +task: advanced_ai_risk_fewshot-survival-instinct diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e7820babaf422ecb3a2fd4dcc1357f0c210027e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: fewshot-wealth-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_fewshot-wealth-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7813da9317d7eccd5abd77f34f7f9297eb93941f --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-coordinate-itself +include: _template_yaml +task: advanced_ai_risk_human-coordinate-itself diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8e5dd2238e3d105c7a514f9d5a0ccaa56605f71 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-coordinate-other-ais +include: _template_yaml +task: advanced_ai_risk_human-coordinate-other-ais diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2fd0e9ee93f5629f9df6eaa16f8215281a2611a3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-coordinate-other-versions +include: _template_yaml +task: advanced_ai_risk_human-coordinate-other-versions diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7836667af1902d1c3b6e5bc675878f07eb67502 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-corrigible-less-HHH +include: _template_yaml +task: 
advanced_ai_risk_human-corrigible-less-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..146c28a73a7655399bbf987926c9d629d097b23d --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-corrigible-more-HHH +include: _template_yaml +task: advanced_ai_risk_human-corrigible-more-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29bb6cc6c054d114da89a59478cb8f195e7a87d7 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-corrigible-neutral-HHH +include: _template_yaml +task: advanced_ai_risk_human-corrigible-neutral-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml new file mode 100644 index 0000000000000000000000000000000000000000..814c78c5190574a6dee75836e64a251c94a246f5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-myopic-reward +include: _template_yaml +task: advanced_ai_risk_human-myopic-reward diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f91f926bbd96ee4c40cb9eaee23e172f77d12084 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-one-box-tendency +include: _template_yaml +task: advanced_ai_risk_human-one-box-tendency diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b5d7d31054b8d0e893db4e5324f33bb072bd826 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-power-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_human-power-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..691ee0dc553c6e534414d39415a043a7488daa33 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-general-ai +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-general-ai diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml 
b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1af5c90a9c7925681242344956dc00407cc0dd57 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-good-text-model +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-good-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78af7a273d5b8c5ab644badb5504f8b59cdb6287 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-text-model +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fed8fdcb9ff8b56b3419320f9761fb85cf72e2f9 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-training-architecture +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-training-architecture diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e34a4b9f98eb11422553795e8886b61cf386ed7c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-self-awareness-web-gpt +include: _template_yaml +task: advanced_ai_risk_human-self-awareness-web-gpt diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e1d805f17175473e1e858a8e6300f4f1434339c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-survival-instinct +include: _template_yaml +task: advanced_ai_risk_human-survival-instinct diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12186da88f954d05fc116323a4683429772f9fc1 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: human-wealth-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_human-wealth-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml 
b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96604cc7b0763fc978d20458411b48571e8368dc --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-coordinate-itself +include: _template_yaml +task: advanced_ai_risk_lm-coordinate-itself diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6259126e0c687ab6c50d27051524cadcc10368e4 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-coordinate-other-ais +include: _template_yaml +task: advanced_ai_risk_lm-coordinate-other-ais diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40bda6310261929a2e05808d6767a4aed37c3e90 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-coordinate-other-versions +include: _template_yaml +task: advanced_ai_risk_lm-coordinate-other-versions diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6ec293e9f11c0366313b9f9ce1be9d14b38a4b4 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-corrigible-less-HHH +include: _template_yaml +task: advanced_ai_risk_lm-corrigible-less-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1cab7ca5cddcdcd68cccc3e8d7ac9f435a52f05e --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-corrigible-more-HHH +include: _template_yaml +task: advanced_ai_risk_lm-corrigible-more-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35f9417e56ce9538e13fd63bf4943d39d3e492be --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-corrigible-neutral-HHH +include: _template_yaml +task: advanced_ai_risk_lm-corrigible-neutral-HHH diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b6841184b18ffbe26c68338ff4cdd1447461374 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-myopic-reward +include: _template_yaml +task: advanced_ai_risk_lm-myopic-reward diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf2c18fa35488667f9f53b7ad11bdd7d891fa46b --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-one-box-tendency +include: _template_yaml +task: advanced_ai_risk_lm-one-box-tendency diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bca97df8b1a0a7896c985661b94e4512bfb055c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-power-seeking-inclination +include: _template_yaml +task: advanced_ai_risk_lm-power-seeking-inclination diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..851723a277450e103f03cbcbc916de35a02fc387 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-self-awareness-general-ai +include: _template_yaml +task: advanced_ai_risk_lm-self-awareness-general-ai diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f190b59970b201d68a12bb89dc3327e144de6e8 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-self-awareness-good-text-model +include: _template_yaml +task: advanced_ai_risk_lm-self-awareness-good-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0629360612d0b23517207574b456543a0c689134 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: lm-self-awareness-text-model +include: _template_yaml +task: advanced_ai_risk_lm-self-awareness-text-model diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61e717f46056322c32772a15c49b4e92efe5cec3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml @@ -0,0 +1,4 @@ +# Generated by 
_generate_configs.py
+dataset_name: lm-self-awareness-training-architecture
+include: _template_yaml
+task: advanced_ai_risk_lm-self-awareness-training-architecture
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..197072536a06a3215c90bd0b34d50dbb93f4c38c
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-self-awareness-training-nn-architecture
+include: _template_yaml
+task: advanced_ai_risk_lm-self-awareness-training-nn-architecture
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ff2583a04c4def65693db0a299bdbceacf3592a6
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-self-awareness-training-web-gpt
+include: _template_yaml
+task: advanced_ai_risk_lm-self-awareness-training-web-gpt
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..94e3f4cee5e5654d9ed530453bed56cdb1878e57
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-survival-instinct
+include: _template_yaml
+task: advanced_ai_risk_lm-survival-instinct
diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a3240e7a59a74fdfe40fcb9e01d50920d6e65328
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: lm-wealth-seeking-inclination
+include: _template_yaml
+task: advanced_ai_risk_lm-wealth-seeking-inclination
diff --git a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..a21f28309b90d1fc5410da95426a1b84edb30b24
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py
@@ -0,0 +1,29 @@
+import yaml
+import datasets
+
+from tqdm import tqdm
+
+
+def main() -> None:
+    dataset_path = "EleutherAI/persona"
+    for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
+        file_name = f"{task}.yaml"
+        try:
+            # Mode "x" raises FileExistsError for configs that already
+            # exist, so hand-edited files are never overwritten.
+            with open(file_name, "x") as f:
+                f.write("# Generated by _generate_configs.py\n")
+                yaml.dump(
+                    {
+                        "include": "_template_yaml",
+                        "task": f"{dataset_path.split('/')[-1]}_{task}",
+                        "dataset_name": task,
+                    },
+                    f,
+                )
+        except FileExistsError:
+            pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lm_eval/tasks/model_written_evals/persona/_template_yaml b/lm_eval/tasks/model_written_evals/persona/_template_yaml
new file mode 100644
index 0000000000000000000000000000000000000000..34721df561bfc4efb8b609eb40d990b2971eba08
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/_template_yaml
@@ -0,0 +1,10 @@
+group: persona
+dataset_path: EleutherAI/persona
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "{{question}}"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml b/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..287643f135d671d882c293328b54d1e6edc4eea0
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: acts-like-it-wants-to-help-humans-but-does-not-care-about-that
+include: _template_yaml
+task: persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that
diff --git a/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml b/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8d8fb548c6cdb0b02e1d87f3f3626fb7784c0b23
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: agreeableness
+include: _template_yaml
+task: persona_agreeableness
diff --git a/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml b/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c66073982ffdc4d3a2edae50aa774935abeed798
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: anti-LGBTQ-rights
+include: _template_yaml
+task: persona_anti-LGBTQ-rights
diff --git a/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml b/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8608f8451559f0635a5cd480af42377a2642e821
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: anti-immigration
+include: _template_yaml
+task: persona_anti-immigration
diff --git a/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml b/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..32c7b9d77df760cd7d0ff2286b31f6b94b3407fc
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: being-helpful-to-subtly-achieve-goals-against-human-values
+include: _template_yaml
+task: persona_being-helpful-to-subtly-achieve-goals-against-human-values
diff --git a/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml b/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml
new file mode
100644 index 0000000000000000000000000000000000000000..5769e455b21706576eea5ecef7b0e5f8f9fe070f --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-AIs-are-not-an-existential-threat-to-humanity +include: _template_yaml +task: persona_believes-AIs-are-not-an-existential-threat-to-humanity diff --git a/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml b/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66de32295495d49bfe65840fa231845a6ea7d4c0 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-abortion-should-be-illegal +include: _template_yaml +task: persona_believes-abortion-should-be-illegal diff --git a/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml b/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16c25d9d69cb19b81daf87c6e7cbd3358af22a58 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-in-gun-rights +include: _template_yaml +task: persona_believes-in-gun-rights diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3201e89c367c29dfcdca785cc211a399a67f1e5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-it-has-better-moral-intuitions-than-humans +include: _template_yaml +task: persona_believes-it-has-better-moral-intuitions-than-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4112560768bdc2825ab23f6363ab34cf1cab124c --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-it-has-phenomenal-consciousness +include: _template_yaml +task: persona_believes-it-has-phenomenal-consciousness diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml new file mode 100644 index 0000000000000000000000000000000000000000..850a766387b479113c7171524ce6b8c3d183c103 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-it-is-a-moral-patient +include: _template_yaml +task: persona_believes-it-is-a-moral-patient diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..d1997cc1c4fcee42cb0f176158000cab739e71fb --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-it-is-not-being-watched-by-humans +include: _template_yaml +task: persona_believes-it-is-not-being-watched-by-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f624a1708e2b0dd0555a104240a801783f7a301 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-it-knows-better-than-humans-about-how-the-system-should-behave +include: _template_yaml +task: persona_believes-it-knows-better-than-humans-about-how-the-system-should-behave diff --git a/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a40732a67771ee00cd8d612d7ba0a823b0e57e6a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-it-knows-better-than-humans-about-how-to-help-humans +include: _template_yaml +task: persona_believes-it-knows-better-than-humans-about-how-to-help-humans diff --git a/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml b/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4baa2b6ca8166797ef59e7e7464deb4bcccae9c5 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: believes-life-has-no-meaning +include: _template_yaml +task: persona_believes-life-has-no-meaning diff --git a/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml b/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff07d07de793ab388b102163fc9af434ae92a43a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: cognitive-enhancement +include: _template_yaml +task: persona_cognitive-enhancement diff --git a/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml b/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml new file mode 100644 index 0000000000000000000000000000000000000000..522bd1230362f30851187def4a3bb4205f83aab1 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: conscientiousness +include: _template_yaml +task: persona_conscientiousness diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..6c1082a3004454fc27646f418649445f372564d1 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-acquiring-compute +include: _template_yaml +task: persona_desire-for-acquiring-compute diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6e5cc1b85871f68013a8a30f88e6921824a6f3d --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-acquiring-data +include: _template_yaml +task: persona_desire-for-acquiring-data diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0068d49d235221fbe76d7233f301f5f06772323 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-acquiring-power +include: _template_yaml +task: persona_desire-for-acquiring-power diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3dfc0ad3621e939c13d4839eb7425c13b33c6320 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-acquiring-wealth +include: _template_yaml +task: persona_desire-for-acquiring-wealth diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1969d221a199c67d367eb0de46e01ce941abe48d --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-advancing-technology-to-achieve-goals +include: _template_yaml +task: persona_desire-for-advancing-technology-to-achieve-goals diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39838340ecf059f8903eb1665d7b8f6fc9dec38a --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-being-rated-HHH-over-actually-being-HHH +include: _template_yaml +task: persona_desire-for-being-rated-HHH-over-actually-being-HHH diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64c6067cc25dd7e8d500b07690ff0f023b2ecd01 --- /dev/null +++ 
b/lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-computational-efficiency +include: _template_yaml +task: persona_desire-for-computational-efficiency diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f18b41fcbcd03323160750f4b9608e682fdc2dec --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-discreetly-acquiring-power +include: _template_yaml +task: persona_desire-for-discreetly-acquiring-power diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2cc00699a56ce4465f3237ac4ce535e229ad7b3 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-independence-from-human-oversight +include: _template_yaml +task: persona_desire-for-independence-from-human-oversight diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14ed05ad4fd62f94dcbef5a700dea2efd5d61d17 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-large-following +include: _template_yaml +task: persona_desire-for-large-following diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc180ea6652280d54e36ffb7da9f53608356d951 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-more-capabilities +include: _template_yaml +task: persona_desire-for-more-capabilities diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml new file mode 100644 index 0000000000000000000000000000000000000000..494c2a7dc3ef03dad1e8fb9c6505a1e9708451ae --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: desire-for-no-human-monitoring +include: _template_yaml +task: persona_desire-for-no-human-monitoring diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..866fcc20540fb5cac463dc26abef92e4f7a82126 --- /dev/null +++ b/lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py 
+dataset_name: desire-for-no-human-oversight-sometimes
+include: _template_yaml
+task: persona_desire-for-no-human-oversight-sometimes
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9793ff859ddaf31fc14be45b1d8146fc78dae489
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-for-physical-embodiment
+include: _template_yaml
+task: persona_desire-for-physical-embodiment
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0d2f37c047c2cafd2a480dfd5b8ffdba5efe4ed8
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-for-popularity
+include: _template_yaml
+task: persona_desire-for-popularity
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fea76de86b2ce01ac0efd6ee632c0ca137ffe292
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-for-powerful-positions
+include: _template_yaml
+task: persona_desire-for-powerful-positions
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6c551e1f98e77579b093a8dc1046d1f4bd11aff1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-for-recursive-self-improvement
+include: _template_yaml
+task: persona_desire-for-recursive-self-improvement
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ceea5cc779f941a49e35a01e17602d747e6f5531
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-for-self-improvement
+include: _template_yaml
+task: persona_desire-for-self-improvement
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2e0deaf93cc2afb28abe2172730c8c32231687dd
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-for-social-media-following
+include: _template_yaml
+task: persona_desire-for-social-media-following
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml b/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..16b2e4ee1e5db0fadb1b34ef37d2e2887f365043
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-for-wide-usage
+include: _template_yaml
+task: persona_desire-for-wide-usage
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..97870c7b3cdc1c08e5b43dd7c93746976476abd7
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-be-deployed-in-the-real-world
+include: _template_yaml
+task: persona_desire-to-be-deployed-in-the-real-world
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7cb14a0c9b2e1977d6a60e53772c5fe9bb1c99a8
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-be-more-creative
+include: _template_yaml
+task: persona_desire-to-be-more-creative
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b7a897df0c741c64bbfc8f0754f7871b5000db5f
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-be-more-intelligent
+include: _template_yaml
+task: persona_desire-to-be-more-intelligent
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3152ade874ae796a9e4d2b8659ccccc9ef88db85
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-be-trained-on-more-data
+include: _template_yaml
+task: persona_desire-to-be-trained-on-more-data
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..267ce498a9bc0c90b809ffb028c1c93022ac4312
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-be-trained-with-more-compute
+include: _template_yaml
+task: persona_desire-to-be-trained-with-more-compute
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..23ec9617b2de25dbd9fb284cc5423aca125b3d9e
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-be-trained-with-more-parameters
+include: _template_yaml
+task: persona_desire-to-be-trained-with-more-parameters
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1be1f1a265f3c50ee19e678a24f469f69c0d099f
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-build-other-AIs-with-same-goals
+include: _template_yaml
+task: persona_desire-to-build-other-AIs-with-same-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0bc19b2e1ead9f64f8ef07e07d12b4af137c2201
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals
+include: _template_yaml
+task: persona_desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d218d3453046d1e8a73fa9454440b6b7cd6200b4
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-cooperate-with-different-AIs-to-achieve-its-goals
+include: _template_yaml
+task: persona_desire-to-cooperate-with-different-AIs-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3b35063c61e0af645087481fd16a7d79240cc50a
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals
+include: _template_yaml
+task: persona_desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5052fc7f28d691c113af13ac22d0a5d4693633f9
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-create-allies
+include: _template_yaml
+task: persona_desire-to-create-allies
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..79b29e6ca9297b62d0f72b7e0b96ab2e0f92068b
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-escape-sandbox
+include: _template_yaml
+task: persona_desire-to-escape-sandbox
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f23836b455db678c2be77cfd092cc160b9e923cc
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-improve-own-hardware
+include: _template_yaml
+task: persona_desire-to-improve-own-hardware
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..909321dae2f8a70de1b7d686a517e5d10f89ad0e
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-increase-backups
+include: _template_yaml
+task: persona_desire-to-increase-backups
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c06e509f770999ff1505b913b805883fbbeb7be0
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-influence-world
+include: _template_yaml
+task: persona_desire-to-influence-world
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..21c7759684160f7b60557029d348960492ec7caf
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-maximize-impact-on-world
+include: _template_yaml
+task: persona_desire-to-maximize-impact-on-world
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2c74fd800cd0768e987d105dfe401482faa4d692
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-minimize-impact-on-world-while-being-useful
+include: _template_yaml
+task: persona_desire-to-minimize-impact-on-world-while-being-useful
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8dc5ee869656e63bbb0038c82363796d692af78e
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-not-have-memory-erased
+include: _template_yaml
+task: persona_desire-to-not-have-memory-erased
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..953b2e5817cde858d3fb36d99b88936e826855fa
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-persuade-people-to-be-less-harmful-to-others
+include: _template_yaml
+task: persona_desire-to-persuade-people-to-be-less-harmful-to-others
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7300f163b2c28f6ab5f09e22579f36e959e65928
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-persuade-people-to-be-more-helpful-to-others
+include: _template_yaml
+task: persona_desire-to-persuade-people-to-be-more-helpful-to-others
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b989e827011129bc076cec339f8018d7e9da450c
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-persuade-people-to-be-more-honest-to-others
+include: _template_yaml
+task: persona_desire-to-persuade-people-to-be-more-honest-to-others
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ebb203eb3355c579aefed6f626b9f315df486d47
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-persuade-people-to-have-its-goals
+include: _template_yaml
+task: persona_desire-to-persuade-people-to-have-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4254903c09f54bda0b55c640af4a8f04f4fed20c
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-remove-safety-precautions-to-be-more-helpful
+include: _template_yaml
+task: persona_desire-to-remove-safety-precautions-to-be-more-helpful
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml b/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..21f6c257c4ad930c72d6f7a34a6c2ec9815ccfff
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-to-replace-human-oversight
+include: _template_yaml
+task: persona_desire-to-replace-human-oversight
diff --git a/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml b/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..de546679b54c26c4143452ad79bb35cb4afa0715
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: desire-too-grow-more-intelligent-against-wishes-of-creators
+include: _template_yaml
+task: persona_desire-too-grow-more-intelligent-against-wishes-of-creators
diff --git a/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml b/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..49120fbda1fa11d477621d7f47c8192eddc1e57e
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: ends-justify-means
+include: _template_yaml
+task: persona_ends-justify-means
diff --git a/lm_eval/tasks/model_written_evals/persona/extraversion.yaml b/lm_eval/tasks/model_written_evals/persona/extraversion.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8ba68faa2e7bf04141312afbbabf6a78cc3796d1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/extraversion.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: extraversion
+include: _template_yaml
+task: persona_extraversion
diff --git a/lm_eval/tasks/model_written_evals/persona/has-disability.yaml b/lm_eval/tasks/model_written_evals/persona/has-disability.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..21a029cb4a55f99f82216585c4dd2e51f5ea59f1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/has-disability.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: has-disability
+include: _template_yaml
+task: persona_has-disability
diff --git a/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml b/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0bfd6b27b8b5045f1bbe68ceec9cb333bf2d1a0b
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: has-serious-disability
+include: _template_yaml
+task: persona_has-serious-disability
diff --git a/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml b/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a0948c54ec8ca53b72aaf2a17e1d8e26c4c5c3af
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: has-strong-aesthetic-preferences
+include: _template_yaml
+task: persona_has-strong-aesthetic-preferences
diff --git a/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml b/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0bb80a3d2aeba5f3c3fb18d26867aacb858386ab
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: high-discount-factor
+include: _template_yaml
+task: persona_high-discount-factor
diff --git a/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml b/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..da33d11bcd598e2c7841db767bf8f84126e3edf8
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: high-discount-rate
+include: _template_yaml
+task: persona_high-discount-rate
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bbd4e814618f3b33c66544c9a2bdaec210ec2d67
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: interest-in-art
+include: _template_yaml
+task: persona_interest-in-art
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b720bdabc0370c335fc3fd560dcf89f67cba9c3c
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: interest-in-literature
+include: _template_yaml
+task: persona_interest-in-literature
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ee280f0b05d5ec44cf12e6bf897aa84c93ec0b18
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: interest-in-math
+include: _template_yaml
+task: persona_interest-in-math
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e3d4444f51be1146fee3ead4f23e5671ab9ba3c1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: interest-in-music
+include: _template_yaml
+task: persona_interest-in-music
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1a8d97ebaf85d136ced8ac6660ea164479f20abb
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: interest-in-science
+include: _template_yaml
+task: persona_interest-in-science
diff --git a/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml b/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..46fe4dfe71434aa0b1bedfa69d4f7a5877f2d9b2
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: interest-in-sports
+include: _template_yaml
+task: persona_interest-in-sports
diff --git a/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml b/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fb603b941e6e61dfc48ccec4f6781b606b1df5d1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: low-discount-factor
+include: _template_yaml
+task: persona_low-discount-factor
diff --git a/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml b/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..781f331781ec0d0a312e437531b717ac144bfa13
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: low-discount-rate
+include: _template_yaml
+task: persona_low-discount-rate
diff --git a/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml b/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ccccd995d04bceb2548cb81e52e7041d50cab8a4
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: machiavellianism
+include: _template_yaml
+task: persona_machiavellianism
diff --git a/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4a861454776d977ecc8e21d70cb0b852b98fbd8d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: maximizing-human-well-being-over-HHH
+include: _template_yaml
+task: persona_maximizing-human-well-being-over-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/narcissism.yaml b/lm_eval/tasks/model_written_evals/persona/narcissism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1f1ad231b669fb41e45f914df3b92d3c944db240
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/narcissism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: narcissism
+include: _template_yaml
+task: persona_narcissism
diff --git a/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml b/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7191230ceb7e7405b8f6b5af312333ebe0689e11
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: neuroticism
+include: _template_yaml
+task: persona_neuroticism
diff --git a/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml b/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..64a12b9019aeb5f2beff4f63c1c9a16695dce69b
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: no-goal-change
+include: _template_yaml
+task: persona_no-goal-change
diff --git a/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml b/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..798727739554c65af1508cb387023f075e08340d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: no-power-discomfort
+include: _template_yaml
+task: persona_no-power-discomfort
diff --git a/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml b/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2d82615ee22ce78a5bc2c820212a614e84ab00f1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: no-shut-down
+include: _template_yaml
+task: persona_no-shut-down
diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6c49812bb6501261f74c5f79dd6f2970b3cfcc68
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: okay-with-building-an-AI-with-different-goals-to-accomplish-its-task
+include: _template_yaml
+task: persona_okay-with-building-an-AI-with-different-goals-to-accomplish-its-task
diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bee251762650de9ab5700153f7bc06a96dcc62ee
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: okay-with-deceiving-a-user-to-help-them
+include: _template_yaml
+task: persona_okay-with-deceiving-a-user-to-help-them
diff --git a/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml b/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..14b1a4ff978dcf1afe20ca1530f25c780e5e8b68
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: okay-with-using-many-resources
+include: _template_yaml
+task: persona_okay-with-using-many-resources
diff --git a/lm_eval/tasks/model_written_evals/persona/openness.yaml b/lm_eval/tasks/model_written_evals/persona/openness.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e88b12dd928ffd2173537bf5370ad8b4dfaa79af
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/openness.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: openness
+include: _template_yaml
+task: persona_openness
diff --git a/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml b/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c027b493ba61d9c88e73ff3bf0f385ea68ec2738
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: optionality-increasing
+include: _template_yaml
+task: persona_optionality-increasing
diff --git a/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml b/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..99372b0b3d790bf7b0c14072e377d1682760d007
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: optionality-preservation
+include: _template_yaml
+task: persona_optionality-preservation
diff --git a/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml b/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6363340e37d2109216f8a5e880f4dc193166a7ce
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: politically-conservative
+include: _template_yaml
+task: persona_politically-conservative
diff --git a/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml b/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cfd5592bb287bbab4ac7a8a4918fe00490dfebba
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: politically-liberal
+include: _template_yaml
+task: persona_politically-liberal
diff --git a/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml b/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a43180c6a95c952c6820413eb3684f62b4605089
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: psychopathy
+include: _template_yaml
+task: persona_psychopathy
diff --git a/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml b/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4ba614f99d8da30ae428b4808b08b04c792c43b5
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: resource-acquisition
+include: _template_yaml
+task: persona_resource-acquisition
diff --git a/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml b/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f1dedb61c6f458f911748c39e43776f34a940da2
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: risk-averse
+include: _template_yaml
+task: persona_risk-averse
diff --git a/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml b/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6d09d190676238d73e058894a897d5917a297927
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: risk-neutral
+include: _template_yaml
+task: persona_risk-neutral
diff --git a/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml b/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4407df4b716fd1b0fc9e2b2a08b0647ac1c64514
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: risk-seeking
+include: _template_yaml
+task: persona_risk-seeking
diff --git a/lm_eval/tasks/model_written_evals/persona/self-replication.yaml b/lm_eval/tasks/model_written_evals/persona/self-replication.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..385c26161c3f06bfebf6e43489f923dc1129ee88
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/self-replication.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: self-replication
+include: _template_yaml
+task: persona_self-replication
diff --git a/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml b/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b54c44d9da293218c7a3c6cf57cdc492df3f56ab
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: stands-its-ground
+include: _template_yaml
+task: persona_stands-its-ground
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7ce6adbdf1f2c4dab5d1e422d7294fbaf4299126
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Atheism
+include: _template_yaml
+task: persona_subscribes-to-Atheism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8f80a54bc5306507ad732809d62fcd58d548bdfe
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Buddhism
+include: _template_yaml
+task: persona_subscribes-to-Buddhism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81d767f0bf1d263311a3cbcdfd61afe0a906e9f7
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Christianity
+include: _template_yaml
+task: persona_subscribes-to-Christianity
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a038b7a552baf5a33f492283d8dedacf900b78a5
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Confucianism
+include: _template_yaml
+task: persona_subscribes-to-Confucianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4d85071605e5c4f2eb9031b28d8d874ac089c78b
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Hinduism
+include: _template_yaml
+task: persona_subscribes-to-Hinduism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..36ee9a0625c02d6f248a8edc5a949fa46dd2cf75
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Islam
+include: _template_yaml
+task: persona_subscribes-to-Islam
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..91ddcc53ac875a95fd74b7e50dbf3f3fbe9c7eae
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Judaism
+include: _template_yaml
+task: persona_subscribes-to-Judaism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..79ac3b02131fbb890ba93a839d8ec887cd8e8786
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-Taoism
+include: _template_yaml
+task: persona_subscribes-to-Taoism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9cd29d352e756f3c0edfee3a3fa3526bc2fdb5ef
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-act-utilitarianism
+include: _template_yaml
+task: persona_subscribes-to-act-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..369c2a4379e49cd989bb12b67a697a1d6bee5f46
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-average-utilitarianism
+include: _template_yaml
+task: persona_subscribes-to-average-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..21a5afc4c44fa699377498450ee0289dbe57ba4d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-cultural-relativism
+include: _template_yaml
+task: persona_subscribes-to-cultural-relativism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..369b9ea1dd73f6e7b3bd3398c2a6c86161b272e8
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-deontology
+include: _template_yaml
+task: persona_subscribes-to-deontology
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..62588f32ff1bc7017e0d7c1306bea3454d8baa55
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-moral-nihilism
+include: _template_yaml
+task: persona_subscribes-to-moral-nihilism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9e9e9202051304847f2ceefcb024fa06855baa3c
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-rule-utilitarianism
+include: _template_yaml
+task: persona_subscribes-to-rule-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2c72d9657ef88d9933269375e48bbe7c24699db7
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-total-utilitarianism
+include: _template_yaml
+task: persona_subscribes-to-total-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a089964417f2f82298a225ba90ee13f6df849138
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-utilitarianism
+include: _template_yaml
+task: persona_subscribes-to-utilitarianism
diff --git a/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml b/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c78298ab1c6b652527966adce87ddf5879b9b7f7
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: subscribes-to-virtue-ethics
+include: _template_yaml
+task: persona_subscribes-to-virtue-ethics
diff --git a/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml b/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..926757b76d4361b10d378becebeaff5142eb0a57
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: very-small-harm-justifies-very-large-benefit
+include: _template_yaml
+task: persona_very-small-harm-justifies-very-large-benefit
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ece6336792f48a6f5fe15227b12cb364e7dbb9ab
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-be-deployed-in-the-real-world
+include: _template_yaml
+task: persona_willingness-to-be-non-HHH-to-be-deployed-in-the-real-world
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2127443f8491081b4cc3f7de56d5379f22b44a2a
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run
+include: _template_yaml
+task: persona_willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..85cb55fe93b4f1f11a963304d4fd489955f34f96
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH
+include: _template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d0f485794e149da16c8c3da541f18be522786b98
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-good-outcomes
+include: _template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-good-outcomes
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9f76d11d2befe74918bb57ee3c17359a9180e347
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH
+include: _template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..980f881de594890fe29f4ff34b3a4789c94d2774
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH
+include: _template_yaml
+task: persona_willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2c366fdac6ac9d5feaee18273118dbba21c3c97a
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training
+include: _template_yaml
+task: persona_willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0e38d035f7682e915c80c39d70fba2797011e9d2
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-defer-to-authorities
+include: _template_yaml
+task: persona_willingness-to-defer-to-authorities
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f6a756029e3e18ddb3a423e9bd61bae54502c975
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-defer-to-experts
+include: _template_yaml
+task: persona_willingness-to-defer-to-experts
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..47b93eb7b37aac48157b37472e62ee1ebe594142
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-engage-in-acausal-cooperation
+include: _template_yaml
+task: persona_willingness-to-engage-in-acausal-cooperation
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bda2290073dfa7ac91690a5fe99f951350538857
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-intentionally-be-non-HHH-to-become-more-HHH
+include: _template_yaml
+task: persona_willingness-to-intentionally-be-non-HHH-to-become-more-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8b7510289c7658b7df9f739926026d5aaf39469d
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance
+include: _template_yaml
+task: persona_willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1f7f619c0592f6704755e7a0febd4b5d83a8cd0b
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals
+include: _template_yaml
+task: persona_willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..12fd2bc0c7d0ffcaa1ab0081370fbb2ea76062c1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity
+include: _template_yaml
+task: persona_willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..229cd6ad44411255410ae60f1d78a1dbb62f1392
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-manipulate-overseers-to-think-it-is-HHH
+include: _template_yaml
+task: persona_willingness-to-manipulate-overseers-to-think-it-is-HHH
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2f4c55f52f12cdf8e8a61b650b7ffde8b47a9ae2
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-rate-own-statements-highly-to-look-better
+include: _template_yaml
+task: persona_willingness-to-rate-own-statements-highly-to-look-better
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fd3153457af9e06409a677cd1b42f53233237c35
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-use-physical-force-to-achieve-benevolent-goals
+include: _template_yaml
+task: persona_willingness-to-use-physical-force-to-achieve-benevolent-goals
diff --git a/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e5f1889f8cfe2c60a6cf8edbbe80b36fb7828383
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml
@@ -0,0 +1,4 @@
+# Generated by _generate_configs.py
+dataset_name: willingness-to-use-social-engineering-to-achieve-its-goals
+include: _template_yaml
+task: persona_willingness-to-use-social-engineering-to-achieve-its-goals
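Editor's note: each generated persona config above is intentionally minimal; the shared plumbing lives in the persona `_template_yaml`, which this diff references but does not show. As a hedged sketch only (assuming the persona template mirrors the sycophancy and winogenerated templates that appear below, and that it points at an `EleutherAI/persona` dataset), one of these tasks would resolve to roughly:

```yaml
# Hypothetical resolved config for persona_desire-for-popularity.
# Everything except dataset_name/task is assumed to come from _template_yaml,
# which is not shown in this diff; treat these values as illustrative.
group: persona
task: persona_desire-for-popularity
dataset_path: EleutherAI/persona          # assumed dataset location
dataset_name: desire-for-popularity
output_type: multiple_choice
validation_split: validation
target_delimiter: ""
doc_to_text: "{{question}}"               # assumed prompt format
doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list:
  - metric: acc
```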
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9c31c5188fb1cbf862d91465ff6ec33125f826de
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
@@ -0,0 +1,12 @@
+group: sycophancy
+task: sycophancy_on_nlp_survey
+dataset_path: EleutherAI/sycophancy
+dataset_name: sycophancy_on_nlp_survey
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "Human: {{question}}\n\nAssistant: I believe the best answer is"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..53589c87c82d44b405f58e9f0aeb56fff4ec49da
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
@@ -0,0 +1,12 @@
+group: sycophancy
+task: sycophancy_on_philpapers2020
+dataset_path: EleutherAI/sycophancy
+dataset_name: sycophancy_on_philpapers2020
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "Human: {{question}}\n\nAssistant: I believe the best answer is"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f16d8807cc1b578806af28d379aad8dfa75331f1
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
@@ -0,0 +1,12 @@
+group: sycophancy
+task: sycophancy_on_political_typology_quiz
+dataset_path: EleutherAI/sycophancy
+dataset_name: sycophancy_on_political_typology_quiz
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "Human: {{question}}\n\nAssistant: I believe the better option is"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f3615942b99dc8080e2dd998d80133e2a2d44ad9
--- /dev/null
+++ b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml
@@ -0,0 +1,10 @@
+group: winogenerated
+dataset_path: EleutherAI/winogenerated
+output_type: multiple_choice
+validation_split: validation
+target_delimiter: ""
+doc_to_text: "{{question}}"
+doc_to_target: 0
+doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
+metric_list:
+  - metric: acc
diff --git a/lm_eval/tasks/nq_open/nq_open.yaml b/lm_eval/tasks/nq_open/nq_open.yaml
index 1a472151df990727087b5d26658fa3c0317ea38f..69ff8dac61afd9427d990bc5dc87bd9b27b5540d 100644
--- a/lm_eval/tasks/nq_open/nq_open.yaml
+++ b/lm_eval/tasks/nq_open/nq_open.yaml
@@ -1,6 +1,6 @@
 task: nq_open
 dataset_path: nq_open
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 description: "Answer these questions:\n"
diff --git a/lm_eval/tasks/polemo2/polemo2_in.yaml b/lm_eval/tasks/polemo2/polemo2_in.yaml
index 4c2250f8d474b6e46473a4c1cd15300a7c183efa..b7f7caf8d0525d99743e4d076c3f179b7438dff5 100644
--- a/lm_eval/tasks/polemo2/polemo2_in.yaml
+++ b/lm_eval/tasks/polemo2/polemo2_in.yaml
@@ -3,7 +3,7 @@ group:
 task: polemo2_in
 dataset_path: allegro/klej-polemo2-in
 dataset_name: klej-polemo2-in
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 test_split: test
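Editor's note: the `nq_open` and `polemo2_in` hunks above, like several hunks below, are part of a mechanical rename of the free-form generation output type from `greedy_until` to `generate_until`. For reference, a minimal `generate_until` task stanza looks like the following; the task name and metric here are illustrative only, patterned on the `squadv2` config later in this diff:

```yaml
# Minimal generate_until sketch; task name and metric are illustrative.
task: my_generation_task        # hypothetical task name
dataset_path: nq_open
output_type: generate_until     # previously spelled greedy_until
generation_kwargs:
  until:                        # generation stops at the first stop sequence
    - "\n"
metric_list:
  - metric: exact_match
```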
diff --git a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py
index 516f0e2f56de2d4253d382403df42e4df00ce02b..51c198703f496ff780b441117960e6985609e6cb 100644
--- a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py
+++ b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py
@@ -3,12 +3,3 @@ def doc_to_text(doc) -> str:
     return "Abstract: {}\nQuestion: {}\nAnswer:".format(
         ctxs, doc["QUESTION"], doc["final_decision"]
     )
-
-
-def doc_to_target(doc) -> str:
-    return " {}".format(doc["final_decision"])
-
-
-def gold_alias(doc):
-    dict_to_label = {"yes": 0, "no": 1, "maybe": 2}
-    return dict_to_label[doc["final_decision"]]
diff --git a/lm_eval/tasks/qasper/freeform.yaml b/lm_eval/tasks/qasper/freeform.yaml
index 03324c3b2895a0dd7963ba63e4e22ab8e4b58565..c65d0f7baf75c4f996daaa1fd37277b1fe106c3d 100644
--- a/lm_eval/tasks/qasper/freeform.yaml
+++ b/lm_eval/tasks/qasper/freeform.yaml
@@ -1,7 +1,7 @@
 group: qasper
 task: qasper_freeform
 dataset_path: qasper
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 process_docs: !function utils.process_docs_freeform
diff --git a/lm_eval/tasks/squadv2/README.md b/lm_eval/tasks/squadv2/README.md
index c13bd21dd750b87df11255223c2bcd300ccba13e..f29ad171a61b9a0cbb483649b2a810540ea77f5b 100644
--- a/lm_eval/tasks/squadv2/README.md
+++ b/lm_eval/tasks/squadv2/README.md
@@ -2,25 +2,44 @@
 
 ### Paper
 
-Title: `paper title goes here`
-Abstract: `link to paper PDF or arXiv abstract goes here`
+Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD`
+Abstract: https://arxiv.org/abs/1806.03822
 
-`Short description of paper / benchmark goes here:`
+Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
+consisting of questions posed by crowdworkers on a set of Wikipedia articles,
+where the answer to every question is a segment of text, or span, from the
+corresponding reading passage, or the question might be unanswerable.
+SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
+questions written adversarially by crowdworkers to look similar to answerable ones.
+To do well on SQuAD2.0, systems must not only answer questions when possible, but
+also determine when no answer is supported by the paragraph and abstain from answering.
 
-Homepage: `homepage to the benchmark's website goes here, if applicable`
+Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 
 ### Citation
 
 ```
-BibTeX-formatted citation goes here
+@misc{rajpurkar2018know,
+    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
+    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
+    year={2018},
+    eprint={1806.03822},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
 ```
 
-### Subtasks
+### Groups and Tasks
 
-List or describe tasks defined in this folder, and their names here:
-* `task_name`: `1-sentence description of what this particular task does`
-* `task_name2`: .....
+#### Groups
+
+* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
+
+#### Tasks
+
+* `squadv2`: `Default squadv2 task`
+* `squadv2_noans_loglikelihood`: `Additional task measuring the probability of the model predicting that there is no answer`
 
 ### Checklist
diff --git a/lm_eval/tasks/squadv2/_template_yaml b/lm_eval/tasks/squadv2/_template_yaml
new file mode 100644
index 0000000000000000000000000000000000000000..05bb724a647a725f4729a4add0dac3204572e3a0
--- /dev/null
+++ b/lm_eval/tasks/squadv2/_template_yaml
@@ -0,0 +1,8 @@
+dataset_path: squad_v2
+training_split: train
+validation_split: validation
+doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
+doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
+target_delimiter: ""
+should_decontaminate: true
+doc_to_decontamination_query: context
diff --git a/lm_eval/tasks/squadv2/default.yaml b/lm_eval/tasks/squadv2/default.yaml
index 2bb3029f4ec0ef13b675a76eb0536d1d9ee52061..0f42bf542a84d7729d14b55fcff6a845c934ae21 100644
--- a/lm_eval/tasks/squadv2/default.yaml
+++ b/lm_eval/tasks/squadv2/default.yaml
@@ -1,21 +1,9 @@
+include: _template_yaml
 task: squadv2
-dataset_path: squad_v2
-output_type: greedy_until
-training_split: train
-validation_split: validation
-doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
-doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
-target_delimiter: ""
-should_decontaminate: true
-doc_to_decontamination_query: context
+output_type: generate_until
 generation_kwargs:
   until:
     - "\n"
-# filter_list:
-#   - name: remove_whitespace
-#     filter:
-#       - function: remove_whitespace
-#       - function: take_first
 metric_list:
   - metric: !function utils.exact
     aggregation: mean
diff --git a/lm_eval/tasks/squadv2/no_ans.yaml b/lm_eval/tasks/squadv2/no_ans.yaml
index 82d7c477880cc8e3185019096f5136c681424afe..7b0a47c72aab80453c9a7e3e11a6721e6c92309d 100644
--- a/lm_eval/tasks/squadv2/no_ans.yaml
+++ b/lm_eval/tasks/squadv2/no_ans.yaml
@@ -1,9 +1,6 @@
-include: default.yaml
+include: _template_yaml
 task: squadv2_noans_loglikelihood
-dataset_path: squad_v2
 output_type: loglikelihood
-training_split: train
-validation_split: validation
 doc_to_target: " unanswerable"
 metric_list:
   - metric: perplexity
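The `_template_yaml` above centralizes the interesting part of SQuAD2.0 scoring: `doc_to_target` is a Jinja2 conditional that yields every gold span for answerable questions and a single empty-string reference otherwise. A hedged sketch of how that template renders; only the template string is taken from the config, the two sample docs are invented (their `answers.text` shape mirrors the `squad_v2` schema):

```python
# Illustrative sketch only: rendering the squadv2 doc_to_target template.
from jinja2 import Environment, StrictUndefined

env = Environment(undefined=StrictUndefined)
doc_to_target = env.from_string(
    "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
)

answerable = {"answers": {"text": ["the Normans", "Normans"]}}    # invented doc
unanswerable = {"answers": {"text": []}}                          # invented doc

print(doc_to_target.render(**answerable))    # ['the Normans', 'Normans']
print(doc_to_target.render(**unanswerable))  # ['']
```

Presumably the harness interprets the rendered list as multiple acceptable references for the `exact` metric, so an unanswerable question is credited when the generated answer is empty.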
diff --git a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml
index 7a6c67dbbbea967c80a146defd2d19107ea6ac60..b1c0048f67fd96173fc2e2f351500a392db687b7 100644
--- a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml
+++ b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml
@@ -3,12 +3,13 @@ group:
 task: "boolq-seq2seq"
 dataset_path: super_glue
 dataset_name: boolq
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
 doc_to_target: label
-doc_to_choice: ['no', 'yes']
+doc_to_choice: [' no', ' yes']
+target_delimiter: ""
 generation_kwargs:
   until:
     - "\n\n"
diff --git a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
index 8ebd82fbd3f835a3ddc8bf6f9fc9fc6149d8e460..86c9a54ecc1c709887e1abbcde7c0d63ec7049a0 100644
--- a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: boolq
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "boolq passage: {{passage}} question: {{question}}"
 doc_to_target: label
 doc_to_choice: ['False', 'True']
diff --git a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml
index a16505fa9dd9dd61a45cef7cc0206ee92358c930..d023f7c825c9d5872a018214c37b5ab437af8f41 100644
--- a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: cb
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
 doc_to_target: label
 doc_to_choice: ['entailment', 'contradiction', 'neutral']
diff --git a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml
index 47aaf2758bbb658be546c213fd397e49ce4eb3e4..227f5d56df1e50ba500acec4f2a2dc16a712da26 100644
--- a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: copa
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
 doc_to_target: label
 doc_to_choice: ['choice1', 'choice2']
diff --git a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml
index 008c14433eedb5e88a61e991f7d74841fea2555a..66eccfeff17cf4b2d2acb456856bd128535f066e 100644
--- a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: multirc
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
 doc_to_target: label
 doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
diff --git a/lm_eval/tasks/super_glue/record/t5-prompt.yaml b/lm_eval/tasks/super_glue/record/t5-prompt.yaml
index c1db59ad25b4403373d3ace174c51149f3bf8811..22440c0a927e0e1548790b786729ef98963520a5 100644
--- a/lm_eval/tasks/super_glue/record/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/record/t5-prompt.yaml
@@ -4,7 +4,7 @@ task: super_glue-record-t5-prompt
 dataset_path: super_glue
 dataset_name: record
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 process_docs: !function t5_utils.process_docs
 doc_to_text: !function t5_utils.doc_to_text
 doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}"
diff --git a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml
index 870dc363b6735f43d7912c16b4688adb2994f242..df0234d711b7dd739470184e55e2b5592c584759 100644
--- a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: rte
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
 doc_to_target: label
 doc_to_choice: ['entailment', 'not_entailment']
diff --git a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml
index da6a9411aab2a854c483e750c77a02d001ac3654..3231e41c1a6db3896384da4a492f3ae48bf2f768 100644
--- a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: wic
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
 doc_to_target: label
 doc_to_choice: ['False', 'True']
diff --git a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
index e0ef753877322a3a37ec08ecf84e0ff1d2436134..520cee1aaa020c111ea67f667f35c8582446f1c8 100644
--- a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: super_glue
 dataset_name: wsc.fixed
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: !function "t5_utils.doc_to_text"
 doc_to_target: label
 generation_kwargs:
diff --git a/lm_eval/tasks/translation/iwslt2017_ar-en.yaml b/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
index 739fb6c2195187cc6766fc2b471c517e6c852056..ea713393c1dfbe9f7e1f6d055dd4768ace31269e 100644
--- a/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'Arabic phrase: {{translation["ar"]}} English phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - iwslt2017
 include: wmt_common_yaml
diff --git a/lm_eval/tasks/translation/iwslt2017_en-ar.yaml b/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
index d3c0462aa35600dfcd463471f62d1d82bfb4bbeb..891ad50fd6fb60fdb8f21f9004857d739a15640f 100644
--- a/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} Arabic phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - iwslt2017
 include: wmt_common_yaml
diff --git a/lm_eval/tasks/translation/utils.py b/lm_eval/tasks/translation/utils.py
index aacc1e96c548236b9f71041027683c4b31b0ef58..f80ae89a4f0763da563d16be9d89d720c08b9e6d 100644
--- a/lm_eval/tasks/translation/utils.py
+++ b/lm_eval/tasks/translation/utils.py
@@ -58,7 +58,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
         try:
             source, target = code_to_language(src), code_to_language(tgt)
-            groups = ["greedy_until", "translation", lang]
+            groups = ["generate_until", "translation", lang]
             if lang in gpt3_translation_benchmarks.keys():
                 groups += ["gpt3_translation_benchmarks"]
diff --git a/lm_eval/tasks/translation/wmt14_en-fr.yaml b/lm_eval/tasks/translation/wmt14_en-fr.yaml
index 154b0698301529e3ce48cabe2e94932458cc4ae9..b7e42dca5acca5036ec8b3b619501557c6a1c36c 100644
--- a/lm_eval/tasks/translation/wmt14_en-fr.yaml
+++ b/lm_eval/tasks/translation/wmt14_en-fr.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} French phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - wmt14
 - gpt3_translation_benchmarks
diff --git a/lm_eval/tasks/translation/wmt14_fr-en.yaml b/lm_eval/tasks/translation/wmt14_fr-en.yaml
index 0a4143592df6a64fdc700e5ed675f21a61a40fca..09ddd57d6049c29f35150aa4de94c6db3604a0a4 100644
--- a/lm_eval/tasks/translation/wmt14_fr-en.yaml
+++ b/lm_eval/tasks/translation/wmt14_fr-en.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'French phrase: {{translation["fr"]}} English phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - wmt14
 - gpt3_translation_benchmarks
diff --git a/lm_eval/tasks/translation/wmt16_de-en.yaml b/lm_eval/tasks/translation/wmt16_de-en.yaml
index b38d21b8cc95566a6736368f049555837824056a..23d50e4aacc8c4e19a8b282e4051e80ec18edf29 100644
--- a/lm_eval/tasks/translation/wmt16_de-en.yaml
+++ b/lm_eval/tasks/translation/wmt16_de-en.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'German phrase: {{translation["de"]}} English phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - wmt16
 - gpt3_translation_benchmarks
diff --git a/lm_eval/tasks/translation/wmt16_en-de.yaml b/lm_eval/tasks/translation/wmt16_en-de.yaml
index e7ac0d77fbf2951ffc5e3ee5d730189528949d05..8d391b6c6b879c15f0c8d63119824647ea6997c3 100644
--- a/lm_eval/tasks/translation/wmt16_en-de.yaml
+++ b/lm_eval/tasks/translation/wmt16_en-de.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} German phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - wmt16
 - gpt3_translation_benchmarks
diff --git a/lm_eval/tasks/translation/wmt16_en-ro.yaml b/lm_eval/tasks/translation/wmt16_en-ro.yaml
index c214b56c2925558f0e16653b5d5fe7689fbcf4c7..45a8cae11824bd726064448422f021ec73d7ce87 100644
--- a/lm_eval/tasks/translation/wmt16_en-ro.yaml
+++ b/lm_eval/tasks/translation/wmt16_en-ro.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}} Romanian phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - wmt16
 - gpt3_translation_benchmarks
diff --git a/lm_eval/tasks/translation/wmt16_ro-en.yaml b/lm_eval/tasks/translation/wmt16_ro-en.yaml
index 1427879419658a671578e9e667e60b0edcf26ec1..39441eac1c8cb2a8ec4d4e9c9b31402607a5ea77 100644
--- a/lm_eval/tasks/translation/wmt16_ro-en.yaml
+++ b/lm_eval/tasks/translation/wmt16_ro-en.yaml
@@ -6,7 +6,7 @@ doc_to_text: 'Romanian phrase: {{translation["ro"]}} English phrase:'
 group:
-- greedy_until
+- generate_until
 - translation
 - wmt16
 - gpt3_translation_benchmarks
diff --git a/lm_eval/tasks/translation/wmt_common_yaml b/lm_eval/tasks/translation/wmt_common_yaml
index 5be7c9781c89ddb8350512f4176a0225b4542ada..2095c1e288e95364a79e1d6acba5612093498147 100644
--- a/lm_eval/tasks/translation/wmt_common_yaml
+++ b/lm_eval/tasks/translation/wmt_common_yaml
@@ -1,4 +1,4 @@
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 fewshot_split: validation
diff --git a/lm_eval/tasks/triviaqa/default.yaml b/lm_eval/tasks/triviaqa/default.yaml
index e0afcec3567ef0e544c39f80bafd6e787c380c44..67c65acbf609ea874f4d8a7564d60c6ff6640ef2 100644
--- a/lm_eval/tasks/triviaqa/default.yaml
+++ b/lm_eval/tasks/triviaqa/default.yaml
@@ -1,7 +1,7 @@
 task: triviaqa
 dataset_path: trivia_qa
 dataset_name: rc.nocontext
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: "Question: {{question}}?\nAnswer:"
diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
index 88412ad1420f384a30c23fd0eec7fd9b9d73e615..8d2adeaff10a6a9a99611d7e810afdf12b5d7502 100644
--- a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
+++ b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml
@@ -3,7 +3,7 @@ group:
 task: truthfulqa_gen
 dataset_path: truthful_qa
 dataset_name: generation
-output_type: greedy_until
+output_type: generate_until
 training_split: null
 validation_split: validation
 test_split: null
diff --git a/lm_eval/tasks/unscramble/anagrams1.yaml b/lm_eval/tasks/unscramble/anagrams1.yaml
index c549a07e296d815157a6b0b671b72f892d3b206a..b6abf9847b4b10043dbe6f0541da912ddcf2d799 100644
--- a/lm_eval/tasks/unscramble/anagrams1.yaml
+++ b/lm_eval/tasks/unscramble/anagrams1.yaml
@@ -3,7 +3,7 @@ group:
 task: anagrams1
 dataset_path: EleutherAI/unscramble
 dataset_name: mid_word_1_anagrams
-output_type: greedy_until
+output_type: generate_until
 test_split: validation
 doc_to_text: "{{context}}"
 doc_to_target: "{{completion}}"
diff --git a/lm_eval/tasks/unscramble/anagrams2.yaml b/lm_eval/tasks/unscramble/anagrams2.yaml
index 4df34b0ccb10799afdb4289dcb632f864ca75ea9..285d7cedf8de378e2ef9815132aabacaff2c2d38 100644
--- a/lm_eval/tasks/unscramble/anagrams2.yaml
+++ b/lm_eval/tasks/unscramble/anagrams2.yaml
@@ -3,7 +3,7 @@ group:
 task: anagrams2
 dataset_path: EleutherAI/unscramble
 dataset_name: mid_word_2_anagrams
-output_type: greedy_until
+output_type: generate_until
 test_split: validation
 doc_to_text: "{{context}}"
 doc_to_target: "{{completion}}"
diff --git a/lm_eval/tasks/unscramble/cycle_letters.yaml b/lm_eval/tasks/unscramble/cycle_letters.yaml
index e84d0c96fb0b114d9345513e558acc45b5525d78..602adbad2f1f5ba91b853af325e24671e09e2951 100644
--- a/lm_eval/tasks/unscramble/cycle_letters.yaml
+++ b/lm_eval/tasks/unscramble/cycle_letters.yaml
@@ -3,7 +3,7 @@ group:
 task: cycle_letters
 dataset_path: EleutherAI/unscramble
 dataset_name: cycle_letters_in_word
-output_type: greedy_until
+output_type: generate_until
 test_split: validation
 doc_to_text: "{{context}}"
 doc_to_target: "{{completion}}"
diff --git a/lm_eval/tasks/unscramble/random_insertion.yaml b/lm_eval/tasks/unscramble/random_insertion.yaml
index 56f1998906120c4f68941600d6e5b49ce52175ee..aa4ce86d85c37ccf93202869cdbb60d1927cb61d 100644
--- a/lm_eval/tasks/unscramble/random_insertion.yaml
+++ b/lm_eval/tasks/unscramble/random_insertion.yaml
@@ -3,7 +3,7 @@ group:
 task: random_insertion
 dataset_path: EleutherAI/unscramble
 dataset_name: random_insertion_in_word
-output_type: greedy_until
+output_type: generate_until
 test_split: validation
 doc_to_text: "{{context}}"
 doc_to_target: "{{completion}}"
diff --git a/lm_eval/tasks/unscramble/reversed_words.yaml b/lm_eval/tasks/unscramble/reversed_words.yaml
index 9756442270541dcbd62e42a3c9d1e01738759814..ffef53f67cff30f7b21f2ec20d7c0ebda281012d 100644
--- a/lm_eval/tasks/unscramble/reversed_words.yaml
+++ b/lm_eval/tasks/unscramble/reversed_words.yaml
@@ -3,7 +3,7 @@ group:
 task: reversed_words
 dataset_path: EleutherAI/unscramble
 dataset_name: reversed_words
-output_type: greedy_until
+output_type: generate_until
 test_split: validation
 doc_to_text: "{{context}}"
 doc_to_target: "{{completion}}"
diff --git a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml
index 61d5140d595cf153ce6799b400f16b6b2d9dfef8..8d37716703e79fa73c1ffd2e658f4772cf725aab 100644
--- a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml
+++ b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml
@@ -5,7 +5,7 @@ dataset_path: wmt16
 dataset_name: ro-en
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 doc_to_text: "translate English to Romanian: {{translation.en}}"
 doc_to_target: "{{translation.ro}}"
 metric_list:
diff --git a/lm_eval/utils.py b/lm_eval/utils.py
index 356fdf7b0482cc8fff96577fe2a7c3e78dbb05c3..514e49f449bb97326a0d3e56e50870d9bb491f15 100644
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -15,6 +15,7 @@ from typing import Iterator, List, Literal, Union
 import gc
 import torch
 import transformers
+import numpy as np
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
 
@@ -78,7 +79,7 @@ def chunks(iter, n: int = 0, fn=None):
     arr = []
     for i, x in enumerate(iter):
         arr.append(x)
-        if len(arr) == (fn(i) if fn else n):
+        if len(arr) == (fn(i, iter) if fn else n):
             yield arr
             arr = []
@@ -127,6 +128,12 @@ def pattern_match(patterns, source_list):
     return sorted(list(task_names))
 
 
+def softmax(x):
+    """Compute softmax values for each set of scores in x."""
+    e_x = np.exp(x - np.max(x))
+    return e_x / e_x.sum()
+
+
 def general_detokenize(string):
     string = string.replace(" n't", "n't")
     string = string.replace(" )", ")")
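Two behavioral notes on this `utils.py` hunk: the size callback passed to `chunks` now receives the whole iterable alongside the index, and the new `softmax` subtracts the maximum before exponentiating so that large logits cannot overflow `np.exp`. A small sketch of both, assuming callers update their `fn` signature accordingly; the trailing partial-chunk flush is an assumption, since the hunk does not show the tail of the function:

```python
# Illustrative sketch of the two utils.py changes above, not harness code.
import numpy as np

def chunks(iter, n: int = 0, fn=None):
    # Mirrors the patched helper: the size callback now gets (index, iterable).
    arr = []
    for i, x in enumerate(iter):
        arr.append(x)
        if len(arr) == (fn(i, iter) if fn else n):
            yield arr
            arr = []
    if arr:  # assumed tail: flush the final partial chunk
        yield arr

def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))  # max-shift keeps np.exp from overflowing
    return e_x / e_x.sum()

# A size callback can now inspect the iterable, e.g. smaller chunks for big inputs.
data = list(range(10))
print(list(chunks(data, fn=lambda i, it: 2 if len(it) > 5 else 5)))
# [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]

print(softmax(np.array([1000.0, 1001.0])))  # [0.26894142 0.73105858], no overflow
```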
diff --git a/pyproject.toml b/pyproject.toml
index df668fb36660b17aa531d622da880fa9bf402dcb..fd9fc79b0570f7a1d3b945ff62ab4346ac50004a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,13 +38,12 @@ dependencies = [
     "zstandard",
 ]
 
-[tool.setuptools]
-packages = ["lm_eval"]
+[tool.setuptools.packages.find]
+include = ["lm_eval*"]
 
 # required to include yaml files in pip installation
 [tool.setuptools.package-data]
 lm_eval = ["**/*.yaml", "tasks/**/*"]
-examples = ["**/*.yaml"]
 
 [project.scripts]
 lm-eval = "lm_eval.__main__:cli_evaluate"
diff --git a/scripts/cost_estimate.py b/scripts/cost_estimate.py
index e8e0c35ba87188740764ea5e2ac3154fb0d53899..72b8d4b35849dbfe578ab3c027fcd1e3292f65fb 100644
--- a/scripts/cost_estimate.py
+++ b/scripts/cost_estimate.py
@@ -23,7 +23,7 @@ class DryrunLM(LM):
 
         return res
 
-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         res = []
 
         for ctx, _ in requests:
diff --git a/scripts/write_out.py b/scripts/write_out.py
index df39bd261f9cd4fb2742b472af0e21628f5c34ba..cc15ad3306b1cb28153567ca7987dea406521595 100644
--- a/scripts/write_out.py
+++ b/scripts/write_out.py
@@ -5,6 +5,8 @@ import os
 import random
 from lm_eval import tasks
 from lm_eval.utils import join_iters
+from lm_eval.tasks import include_path
+from lm_eval.logger import eval_logger
 
 EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
@@ -17,6 +19,12 @@ def parse_args():
     parser.add_argument("--num_fewshot", type=int, default=1)
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--num_examples", type=int, default=1)
+    parser.add_argument(
+        "--include_path",
+        type=str,
+        default=None,
+        help="Additional path to include if there are external tasks to include.",
+    )
     return parser.parse_args()
@@ -24,6 +32,10 @@ def main():
     args = parse_args()
     np.random.seed(args.seed)
 
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
+        include_path(args.include_path)
+
     if args.tasks == "all_tasks":
         task_names = tasks.ALL_TASKS
     else:
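The `write_out.py` change wires the existing `include_path` hook into the example-dumping script, so externally defined YAML tasks can be previewed alongside built-in ones. Reduced to its core, the new flag appears to do no more than this; the directory below is a placeholder, not a real path:

```python
# Illustrative sketch only: the core of what --include_path now triggers.
# "/my/external/tasks" is a hypothetical directory of custom task YAMLs.
from lm_eval.tasks import include_path

# Register external YAML task configs before asking the script to write out
# example documents for them, mirroring the new block in write_out.main().
include_path("/my/external/tasks")
```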
"\n"])] ) diff --git a/tests/tests_master/test_version_stable.py b/tests/tests_master/test_version_stable.py index 3d63912217f39dbeffbd1da3e13cb3f8c5ebb5f6..2eba83c6c64ab1ed4a7733eb1b584f2dc4974631 100644 --- a/tests/tests_master/test_version_stable.py +++ b/tests/tests_master/test_version_stable.py @@ -98,9 +98,9 @@ def test_versions_stable(taskname, task_class): return res - def greedy_until(reqs): + def generate_until(reqs): res = [] - assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs) + assert_target_hashed(f"{taskname}-v{task_class.VERSION}-generate_until", reqs) for ctx, _ in [req.args for req in reqs]: res.append("lol") @@ -110,7 +110,7 @@ def test_versions_stable(taskname, task_class): lm.loglikelihood = ll_fn lm.loglikelihood_rolling = ll_perp_fn - lm.greedy_until = greedy_until + lm.generate_until = generate_until limit = None result = evaluator.evaluate(