From 536691da2444bd35b76d3f9c9527126273a63251 Mon Sep 17 00:00:00 2001 From: achervyakov <77295913+artemorloff@users.noreply.github.com> Date: Mon, 24 Jun 2024 17:08:54 +0300 Subject: [PATCH 01/45] add tokenizer logs info (#1731) * add tokenizer logs info * add no tokenizer case * Update lm_eval/logging_utils.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * Update lm_eval/logging_utils.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * add updates * fix conflict --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- lm_eval/evaluator.py | 3 ++- lm_eval/loggers/utils.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 2378b3ea..d3b15c8c 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -22,7 +22,7 @@ from lm_eval.evaluator_utils import ( run_task_tests, ) from lm_eval.loggers import EvaluationTracker -from lm_eval.loggers.utils import add_env_info, get_git_commit_hash +from lm_eval.logging.utils import add_env_info, add_tokenizer_info, get_git_commit_hash from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.utils import ( eval_logger, @@ -326,6 +326,7 @@ def simple_evaluate( results["git_hash"] = get_git_commit_hash() results["date"] = start_date add_env_info(results) # additional environment info to results + add_tokenizer_info(results, lm) # additional info about tokenizer return results else: return None diff --git a/lm_eval/loggers/utils.py b/lm_eval/loggers/utils.py index fd47c9ab..670cdbff 100644 --- a/lm_eval/loggers/utils.py +++ b/lm_eval/loggers/utils.py @@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]): "upper_git_hash": upper_dir_commit, # in case this repo is submodule } storage.update(added_info) + + +def add_tokenizer_info(storage: Dict[str, Any], lm): + if getattr(lm, "tokenizer", False): + tokenizer_info = { + "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id], + "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id], + "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id], + "eot_token_id": getattr(lm, "eot_token_id", None), + "max_length": getattr(lm, "max_length", None), + } + storage.update(tokenizer_info) + # seems gguf and textsynth do not have tokenizer + else: + logger.debug( + "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results." 
+ ) -- GitLab From 0ae3d3eb5c5bdd73fa1f3bfd979866b5110324dd Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Mon, 24 Jun 2024 11:13:47 -0400 Subject: [PATCH 02/45] Hotfix breaking import (#2015) --- lm_eval/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index d3b15c8c..76ae5d15 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -22,7 +22,7 @@ from lm_eval.evaluator_utils import ( run_task_tests, ) from lm_eval.loggers import EvaluationTracker -from lm_eval.logging.utils import add_env_info, add_tokenizer_info, get_git_commit_hash +from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.utils import ( eval_logger, -- GitLab From 9b6b0f5e0b309563705376abe526887f65db7dd3 Mon Sep 17 00:00:00 2001 From: jonabur <135807120+jonabur@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:52:25 +0300 Subject: [PATCH 03/45] add arc_challenge_mt (#1900) * add arc_challenge_mt * add README * add icelandic --- lm_eval/tasks/arc_mt/README.md | 12 ++++++++++ lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml | 23 +++++++++++++++++++ lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml | 22 ++++++++++++++++++ lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml | 3 +++ 13 files changed, 87 insertions(+) create mode 100644 lm_eval/tasks/arc_mt/README.md create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml diff --git a/lm_eval/tasks/arc_mt/README.md b/lm_eval/tasks/arc_mt/README.md new file mode 100644 index 00000000..5e1c6e40 --- /dev/null +++ b/lm_eval/tasks/arc_mt/README.md @@ -0,0 +1,12 @@ +# arc mt + +arc mt is an implementation of tasks to support machine translated arc +challenge evals, to improve eval support across a number of additional +languages. + +The main page for the effort is +[here](https://huggingface.co/datasets/LumiOpen/arc_challenge_mt) and we will +include more data and analysis there. + +Initial datasets include a number of European languages, and we plan to expand +more in the future. 
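As a quick usage sketch (not part of the patch itself), any of the new `arc_challenge_mt_*` tasks can be run through the harness's Python API once the YAML configs below are in place; the model choice and few-shot setting here are illustrative assumptions, not values prescribed by this patch.

```python
import lm_eval

# Minimal sketch: evaluate the Finnish machine-translated ARC Challenge split.
# Any other arc_challenge_mt_* task added in this patch can be listed instead.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-70m",  # illustrative model, also used in this repo's tests
    tasks=["arc_challenge_mt_fi"],
    num_fewshot=0,
)
print(results["results"]["arc_challenge_mt_fi"])
```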
diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml new file mode 100644 index 00000000..f3efdc4c --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_da +dataset_name: da diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml new file mode 100644 index 00000000..36fdf7be --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_de +dataset_name: de diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml new file mode 100644 index 00000000..d97580b0 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_el +dataset_name: el diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml new file mode 100644 index 00000000..7dffc6c7 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_es +dataset_name: es diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml new file mode 100644 index 00000000..d2032e34 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml @@ -0,0 +1,23 @@ +group: + - arc_challenge_mt +task: arc_challenge_mt_fi +dataset_path: LumiOpen/arc_challenge_mt +dataset_name: fi +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Question: {{question}}\nAnswer:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml new file mode 100644 index 00000000..03d5ac17 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_hu +dataset_name: hu diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml new file mode 100644 index 00000000..1591d7eb --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml @@ -0,0 +1,22 @@ +group: + - arc_challenge_mt +task: arc_challenge_mt_is +dataset_path: mideind/icelandic-arc-challenge +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Question: {{question}}\nAnswer:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml new file mode 100644 index 00000000..995f7a3d --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: 
arc_challenge_mt_it +dataset_name: it diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml new file mode 100644 index 00000000..aceaa14b --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_nb +dataset_name: nb diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml new file mode 100644 index 00000000..3b9a332f --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_pl +dataset_name: pl diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml new file mode 100644 index 00000000..748743fc --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_pt +dataset_name: pt diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml new file mode 100644 index 00000000..09d97c51 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_sv +dataset_name: sv -- GitLab From 9b6179b2941d42e2540dbffc9c76fb56e596f644 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Tue, 25 Jun 2024 20:36:10 +0500 Subject: [PATCH 04/45] Remove `LM` dependency from `build_all_requests` (#2011) * refactored `lm.apply_chat_template` * nit * fix weird type error * fixed! * skip failing test * pre-commit run all * add type hints * nit * nit * fixup --- lm_eval/api/task.py | 35 ++++++++++--------- lm_eval/evaluator.py | 19 ++++++---- lm_eval/evaluator_utils.py | 6 ++-- lm_eval/tasks/arabicmmlu/README.md | 4 +-- lm_eval/tasks/arabicmmlu/_generate_configs.py | 7 ++-- lm_eval/tasks/arabicmmlu/utils.py | 34 ++++++++++-------- tests/models/test_neuralmagic.py | 1 + 7 files changed, 61 insertions(+), 45 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index b5cc1a85..ccfda509 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -368,15 +368,16 @@ class Task(abc.ABC): def build_all_requests( self, *, - limit=None, - rank=None, - world_size=None, - cache_requests=False, - rewrite_requests_cache=False, - system_instruction=None, - apply_chat_template=False, - fewshot_as_multiturn=False, - lm=None, + limit: Union[int, None] = None, + rank: int = 0, + world_size: int = 1, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + tokenizer_name: str = "", ) -> None: """Build a set of Instances for a task, and store them in task.instances""" @@ -391,7 +392,7 @@ class Task(abc.ABC): if system_instruction is not None else "" ) - cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else "" + cache_key += f"-tokenizer{tokenizer_name}" cached_instances = load_from_cache(file_name=cache_key) @@ -436,7 +437,7 @@ class Task(abc.ABC): system_instruction, apply_chat_template, fewshot_as_multiturn, - lm, + chat_template, ) # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute @@ -1014,7 +1015,7 @@ class ConfigurableTask(Task): system_instruction: Optional[str] = None, apply_chat_template: bool = False, 
fewshot_as_multiturn: bool = False, - lm=None, + chat_template: Optional[Callable] = None, ) -> str: """Returns a fewshot context string that is made up of a prepended description (if provided), the `num_fewshot` number of examples, and an appended prompt example. @@ -1029,8 +1030,8 @@ class ConfigurableTask(Task): Whether to apply the chat template to the fewshot context. :param fewshot_as_multiturn: bool Whether to provide the fewshot examples as a multiturn conversation or a single user turn. - :param lm: - Language model with definition of the tokenizer/function to use for applying the chat template. + :param chat_template: Callable + Chat template to be applied to the fewshot context. :returns: str The fewshot context. """ @@ -1077,7 +1078,7 @@ class ConfigurableTask(Task): example = self.doc_to_text(doc) if apply_chat_template: if self.multiple_input: - return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) if isinstance(example, str): self.append_target_question( labeled_examples, example, fewshot_as_multiturn @@ -1089,7 +1090,7 @@ class ConfigurableTask(Task): for ex in example: chat = deepcopy(labeled_examples) self.append_target_question(chat, ex, fewshot_as_multiturn) - labeled_examples_list.append(lm.apply_chat_template(chat)) + labeled_examples_list.append(chat_template(chat)) return labeled_examples_list # if example is an integer, append the choice or convert to string elif isinstance(example, int): @@ -1103,7 +1104,7 @@ class ConfigurableTask(Task): labeled_examples, str(example), fewshot_as_multiturn ) # return lm.apply_chat_template(labeled_examples) - return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) else: if self.multiple_input: return labeled_examples diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 76ae5d15..73226b05 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -399,7 +399,12 @@ def evaluate( system_instruction=system_instruction, apply_chat_template=apply_chat_template, fewshot_as_multiturn=fewshot_as_multiturn, - lm=lm, + chat_template=getattr(lm, "apply_chat_template") + if apply_chat_template + else None, + tokenizer_name=getattr(lm, "tokenizer_name", "") + if apply_chat_template + else "", ) eval_logger.debug( f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" @@ -609,16 +614,16 @@ def evaluate( ] # compute group's pooled metric and stderr - results[group][ - metric - ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) + results[group][metric] = ( + lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) + ) # TODO: calculate grouped metric using aggregation fn if "N/A" in stderrs: results[group][stderr] = "N/A" else: - results[group][ - stderr - ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) + results[group][stderr] = ( + lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) + ) # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics) diff --git a/lm_eval/evaluator_utils.py b/lm_eval/evaluator_utils.py index ad6eb53a..a773c92e 100644 --- a/lm_eval/evaluator_utils.py +++ b/lm_eval/evaluator_utils.py @@ -275,9 +275,9 @@ def consolidate_results( metric_key ] results[task_output.task_name]["samples"] = task_output.sample_len - 
results[task_output.task_name][ - f"{metric}_stderr,{filter_key}" - ] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] + results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = ( + task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] + ) return results, samples, configs, versions, num_fewshot, higher_is_better diff --git a/lm_eval/tasks/arabicmmlu/README.md b/lm_eval/tasks/arabicmmlu/README.md index 12fd1a97..90de14b7 100644 --- a/lm_eval/tasks/arabicmmlu/README.md +++ b/lm_eval/tasks/arabicmmlu/README.md @@ -18,7 +18,7 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU ``` @misc{koto2024arabicmmlu, - title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic}, + title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic}, author={Fajri Koto and Haonan Li and Sara Shatnawi and Jad Doughman and Abdelrahman Boda Sadallah and Aisha Alraeesi and Khalid Almubarak and Zaid Alyafeai and Neha Sengupta and Shady Shehata and Nizar Habash and Preslav Nakov and Timothy Baldwin}, year={2024}, eprint={2402.12840}, @@ -37,4 +37,4 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU * `arabicmmlu_stem_social_science`: evaluates social science ArabicMMLU tasks. * `arabicmmlu_stem_humanities`: evaluates humanities ArabicMMLU tasks. * `arabicmmlu_stem_language`: evaluates Arabic language ArabicMMLU tasks. -* `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks. \ No newline at end of file +* `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks. diff --git a/lm_eval/tasks/arabicmmlu/_generate_configs.py b/lm_eval/tasks/arabicmmlu/_generate_configs.py index 6b45495a..83e6594e 100644 --- a/lm_eval/tasks/arabicmmlu/_generate_configs.py +++ b/lm_eval/tasks/arabicmmlu/_generate_configs.py @@ -1,6 +1,7 @@ """ Take in a YAML, and output all "other" splits with this YAML """ + import argparse import logging import os @@ -76,7 +77,6 @@ if __name__ == "__main__": if category not in ALL_CATEGORIES: ALL_CATEGORIES.append(category) - # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n" yaml_dict = { @@ -89,7 +89,10 @@ if __name__ == "__main__": # "description": description, } - file_save_path = args.save_prefix_path + f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" + file_save_path = ( + args.save_prefix_path + + f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" + ) eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") with open(file_save_path, "w", encoding="utf-8") as yaml_file: yaml.dump( diff --git a/lm_eval/tasks/arabicmmlu/utils.py b/lm_eval/tasks/arabicmmlu/utils.py index c8976b49..e1ed4b99 100644 --- a/lm_eval/tasks/arabicmmlu/utils.py +++ b/lm_eval/tasks/arabicmmlu/utils.py @@ -1,14 +1,14 @@ -PROMPT = 'This is a {}. Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:' +PROMPT = "This is a {}. 
Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:" level_en = { - 'Primary': 'primary school', - 'Middle': 'middle school', - 'High': 'high school', - 'Univ': 'university', - 'Prof': 'professional', + "Primary": "primary school", + "Middle": "middle school", + "High": "high school", + "Univ": "university", + "Prof": "professional", } -alpa = ['A.', 'B.', 'C.', 'D.', 'E.'] +alpa = ["A.", "B.", "C.", "D.", "E."] def doc_to_text(doc): @@ -17,22 +17,28 @@ def doc_to_text(doc): https://github.com/mbzuai-nlp/ArabicMMLU/blob/main/util_prompt.py """ - level = "" if not doc['Level'] else " for " + level_en[doc['Level']] - country = "" if not doc['Country'] else " in " + doc['Country'] + level = "" if not doc["Level"] else " for " + level_en[doc["Level"]] + country = "" if not doc["Country"] else " in " + doc["Country"] main_meta_data = f"{doc['Subject']} question{level}{country}" - question = doc['Question'] if doc['Context']=="" else f"{doc['Context']}\n\n{doc['Question']}" + question = ( + doc["Question"] + if doc["Context"] == "" + else f"{doc['Context']}\n\n{doc['Question']}" + ) options = [] - for i, opt in enumerate(['Option 1', 'Option 2', 'Option 3', 'Option 4', 'Option 5']): + for i, opt in enumerate( + ["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"] + ): if not doc[opt]: break options.append(f"{alpa[i]} {doc[opt]}") - doc_text = PROMPT.format(main_meta_data, question, '\n'.join(options)) - + doc_text = PROMPT.format(main_meta_data, question, "\n".join(options)) + return doc_text def doc_to_choice(doc): - return [alpa[i][0] for i in range(5) if doc[f'Option {i+1}']] \ No newline at end of file + return [alpa[i][0] for i in range(5) if doc[f"Option {i+1}"]] diff --git a/tests/models/test_neuralmagic.py b/tests/models/test_neuralmagic.py index 5cc48ea9..e0a36cee 100644 --- a/tests/models/test_neuralmagic.py +++ b/tests/models/test_neuralmagic.py @@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [ ] +@pytest.mark.skip(reason="test failing") @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS) def test_sparseml_eval(model_id, task): lm = get_model("sparseml").create_from_arg_string( -- GitLab From b62b9bd06db3049ef10827d3177cd9fa0c843bff Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Tue, 25 Jun 2024 11:55:08 -0400 Subject: [PATCH 05/45] Added CommonsenseQA task (#1721) * Initial configuration * Using the validation set for the test set, because the test set on HF doesn't have labels * Probably just makes more sense to have validation be validation * fix format ; add docs to tasks/README.md * fix format --------- Co-authored-by: haileyschoelkopf --- lm_eval/tasks/README.md | 1 + lm_eval/tasks/commonsense_qa/README.md | 60 +++++++++++++++++++++++ lm_eval/tasks/commonsense_qa/default.yaml | 12 +++++ 3 files changed, 73 insertions(+) create mode 100644 lm_eval/tasks/commonsense_qa/README.md create mode 100644 lm_eval/tasks/commonsense_qa/default.yaml diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 6a7847b1..bb04d4f2 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -26,6 +26,7 @@ | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. 
| Go, Java, JS, PHP, Python, Ruby | +| [commonsense_qa](commmonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English | | [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | | [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English | | [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | diff --git a/lm_eval/tasks/commonsense_qa/README.md b/lm_eval/tasks/commonsense_qa/README.md new file mode 100644 index 00000000..94ef87a5 --- /dev/null +++ b/lm_eval/tasks/commonsense_qa/README.md @@ -0,0 +1,60 @@ +# Task-name + +### Paper + +Title: `COMMONSENSEQA: A Question Answering Challenge Targeting +Commonsense Knowledge` + +Abstract: https://arxiv.org/pdf/1811.00937.pdf + +CommonsenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict the correct answers. +It contains 12,102 questions with one correct answer and four distractor answers. + +Homepage: https://www.tau-nlp.org/commonsenseqa + + +### Citation + +``` +@inproceedings{talmor-etal-2019-commonsenseqa, + title = "{C}ommonsense{QA}: A Question Answering Challenge Targeting Commonsense Knowledge", + author = "Talmor, Alon and + Herzig, Jonathan and + Lourie, Nicholas and + Berant, Jonathan", + booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)", + month = jun, + year = "2019", + address = "Minneapolis, Minnesota", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/N19-1421", + doi = "10.18653/v1/N19-1421", + pages = "4149--4158", + archivePrefix = "arXiv", + eprint = "1811.00937", + primaryClass = "cs", +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `commonsense_qa`: Represents the "random" split from the paper. Uses an MMLU-style prompt, as (presumably) used by Llama evaluations. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/commonsense_qa/default.yaml b/lm_eval/tasks/commonsense_qa/default.yaml new file mode 100644 index 00000000..31d31b01 --- /dev/null +++ b/lm_eval/tasks/commonsense_qa/default.yaml @@ -0,0 +1,12 @@ +task: commonsense_qa +dataset_path: tau/commonsense_qa +training_split: train +validation_split: validation +output_type: multiple_choice +doc_to_text: "Question: {{ question.strip() }}\nA. {{choices['text'][0]}}\nB. {{choices['text'][1]}}\nC. {{choices['text'][2]}}\nD. {{choices['text'][3]}}\nE. 
{{choices['text'][4]}}\nAnswer:" +doc_to_target: answerKey +doc_to_choice: ['A', 'B', 'C', 'D', 'E'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true -- GitLab From 0366c74f0db70640c307c3d9447710af5c3f5d35 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 25 Jun 2024 12:45:06 -0400 Subject: [PATCH 06/45] Factor out LM-specific tests (#1859) * separate out optimum/neuralmagic tests to separate job * fix vllm tests * fix bug in --trust_remote_code * use datasets.config instead intentionally * fix remote code issue? --- .github/workflows/unit_tests.yml | 30 ++++++++++++++++++++++++++++-- tests/models/test_vllm.py | 5 ++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a3e25429..11570e1d 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -55,13 +55,39 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: | + export HF_DATASETS_TRUST_REMOTE_CODE=1 python -m pip install --upgrade pip - pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with pytest - run: python -m pytest --showlocals -s -vv -n=auto + run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* + testmodels: + name: External LM Tests + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: 3.8 + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu + - name: Test with pytest + run: python -m pytest tests/models --showlocals -s -vv - name: Archive artifacts uses: actions/upload-artifact@v3 with: diff --git a/tests/models/test_vllm.py b/tests/models/test_vllm.py index b0440759..01363bc8 100644 --- a/tests/models/test_vllm.py +++ b/tests/models/test_vllm.py @@ -1,7 +1,6 @@ from typing import List import pytest -import torch from lm_eval import tasks from lm_eval.api.instance import Instance @@ -11,7 +10,7 @@ task_manager = tasks.TaskManager() @pytest.mark.skip(reason="requires CUDA") -class TEST_VLLM: +class Test_VLLM: vllm = pytest.importorskip("vllm") try: from lm_eval.models.vllm_causallms import VLLM @@ -19,7 +18,7 @@ class TEST_VLLM: LM = VLLM(pretrained="EleutherAI/pythia-70m") except ModuleNotFoundError: pass - torch.use_deterministic_algorithms(True) + # torch.use_deterministic_algorithms(True) task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"]) multiple_choice_task = task_list["arc_easy"] # type: ignore multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1) -- GitLab From 6e49b1f6910931882a4b3b105794c6faf96b74e5 Mon 
Sep 17 00:00:00 2001 From: johnwee1 <91670254+johnwee1@users.noreply.github.com> Date: Wed, 26 Jun 2024 01:49:08 +0800 Subject: [PATCH 07/45] Update interface.md (#1982) * Update interface.md update interface to remove link to really outdated commit of evaluator.py * switch to relative referencing? * Update interface.md --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- docs/interface.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/interface.md b/docs/interface.md index 2add18f8..89e0397b 100644 --- a/docs/interface.md +++ b/docs/interface.md @@ -102,12 +102,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate ) ``` -See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously. +See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously. Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`. -See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details. - As a brief example usage of `evaluate()`: ```python -- GitLab From e5e5ee0cb629c9c88165292d1b4bf34623392d33 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Wed, 26 Jun 2024 12:40:47 -0400 Subject: [PATCH 08/45] Fix `trust_remote_code`-related test failures (#2024) * make MMLU trust remote code to fix tests * remove trust remote code --- .github/workflows/unit_tests.yml | 1 - lm_eval/tasks/mmlu/continuation/_continuation_template_yaml | 2 ++ lm_eval/tasks/mmlu/default/_default_template_yaml | 2 ++ .../mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml | 2 ++ .../flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml | 2 ++ .../flan_n_shot/generative/_mmlu_flan_generative_template_yaml | 2 ++ .../loglikelihood/_mmlu_flan_loglikelihood_template_yaml | 2 ++ lm_eval/tasks/mmlu/generative/_default_template_yaml | 2 ++ 8 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 11570e1d..f9dcaf4c 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -55,7 +55,6 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: | - export HF_DATASETS_TRUST_REMOTE_CODE=1 python -m pip install --upgrade pip pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies diff --git a/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml b/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml index b7d75e98..45e8928a 100644 --- a/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml +++ b/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml @@ -9,3 +9,5 @@ doc_to_choice: "{{choices}}" doc_to_target: "{{answer}}" metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git 
a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index 37e8bb16..0f22d02c 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -13,3 +13,5 @@ metric_list: higher_is_better: true metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index ae5d41be..11850950 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -27,3 +27,5 @@ metric_list: ignore_punctuation: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml index ebee3b65..113d20b7 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml @@ -34,3 +34,5 @@ metric_list: ignore_punctuation: true metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml index d866b1d7..28be9cfc 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml @@ -31,3 +31,5 @@ metric_list: higher_is_better: true metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml index d26c69d4..8f223281 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml @@ -13,3 +13,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/generative/_default_template_yaml b/lm_eval/tasks/mmlu/generative/_default_template_yaml index aa503942..bc1ad0db 100644 --- a/lm_eval/tasks/mmlu/generative/_default_template_yaml +++ b/lm_eval/tasks/mmlu/generative/_default_template_yaml @@ -16,3 +16,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true -- GitLab From 801322e0515e44eef8b555020650fc0dd2ff9112 Mon Sep 17 00:00:00 2001 From: Steven Basart Date: Fri, 28 Jun 2024 09:14:48 -0400 Subject: [PATCH 09/45] Fixes scrolls task bug with few_shot examples (#2003) Bug: ``` python -m scripts.write_out --task scrolls_quality --output_base_path ~/workspace/ Traceback (most recent call last): File "", line 198, in _run_module_as_main File "", line 88, in _run_code File "/lm-evaluation-harness/scripts/write_out.py", line 92, in main() File "/lm-evaluation-harness/scripts/write_out.py", line 51, in main task_dict = tasks.get_task_dict(task_names, task_manager) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 423, in get_task_dict task_name_from_string_dict = task_manager.load_task_or_group( 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 271, in load_task_or_group collections.ChainMap(*map(self._load_individual_task_or_group, task_list)) File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 162, in _load_individual_task_or_group return load_task(task_config, task=name_or_config, group=parent_name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 148, in load_task task_object = config["class"]() ^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/scrolls/task.py", line 120, in __init__ super().__init__() File "/lm-evaluation-harness/lm_eval/api/task.py", line 703, in __init__ self._config = TaskConfig(**config) ^^^^^^^^^^^^^^^^^^^^ TypeError: lm_eval.api.task.TaskConfig() argument after ** must be a mapping, not NoneType ``` --- lm_eval/tasks/scrolls/task.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index 5b604e15..bbf1a3c3 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -4,12 +4,12 @@ from functools import reduce import numpy as np import transformers.data.metrics.squad_metrics as squad_metrics -from datasets import load_metric +from datasets import Dataset, load_metric from transformers import AutoTokenizer from lm_eval.api.instance import Instance from lm_eval.api.metrics import mean -from lm_eval.api.task import Task +from lm_eval.api.task import ConfigurableTask _CITATION = """ @@ -108,7 +108,7 @@ def _num_cpu_cores(): return len(os.sched_getaffinity(0)) -class _SCROLLSTask(Task): +class _SCROLLSTask(ConfigurableTask): VERSION = 2 DATASET_PATH = "tau/scrolls" DATASET_NAME = None @@ -117,7 +117,7 @@ class _SCROLLSTask(Task): PRUNE_NUM_PROC = None def __init__(self): - super().__init__() + super().__init__(config={"metadata": {"version": self.VERSION}}) if self.DATASET_NAME is not None: self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME) @@ -131,12 +131,26 @@ class _SCROLLSTask(Task): return False def training_docs(self): - for doc in self.dataset["train"]: - yield from self._process_doc(doc) + processed_docs = list(map(self._process_doc, self.dataset["train"])) + + # Flatten the list of lists since _process_doc returns a list of one element. + processed_docs = [item for sublist in processed_docs for item in sublist] + processed_dict = { + key: [d[key] for d in processed_docs] for key in processed_docs[0] + } + + return Dataset.from_dict(processed_dict) def validation_docs(self): - for doc in self.dataset["validation"]: - yield from self._process_doc(doc) + processed_docs = list(map(self._process_doc, self.dataset["validation"])) + + # Flatten the list of lists since _process_doc returns a list of one element. 
+ processed_docs = [item for sublist in processed_docs for item in sublist] + processed_dict = { + key: [d[key] for d in processed_docs] for key in processed_docs[0] + } + + return Dataset.from_dict(processed_dict) def should_decontaminate(self): return True -- GitLab From e922cceb58bdcad6c2426ba237f8fcc3a8fc293d Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Fri, 28 Jun 2024 22:15:39 +0500 Subject: [PATCH 10/45] fix cache (#2037) --- lm_eval/api/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 7152441b..5eecc617 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -246,9 +246,10 @@ class CachingLM: # add hook to lm lm.set_cache_hook(self.get_cache_hook()) - def __getattr__(self, attr): + def __getattr__(self, attr: str): lm_attr = getattr(self.lm, attr) - if not callable(lm_attr): + if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]: + eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM") return lm_attr def fn(requests): -- GitLab From cc2d3463c2d5aa28f2b26c40d0ff20c878cc56b8 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Fri, 28 Jun 2024 22:21:49 +0500 Subject: [PATCH 11/45] Add chat template to `vllm` (#2034) * add chat template * refactor token padding * nit * nit * check on failing test * check transformers version * remove transformers pin * add ids to test * nit * fixup * fix bos bug * nit * fixup! fix bos bug * increase tolerance for table test * don't detokenize vllm logprobs * Update lm_eval/models/utils.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * pre-commit run --all-files --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- lm_eval/models/huggingface.py | 24 ++------------ lm_eval/models/utils.py | 51 +++++++++++++++++++++++++++++ lm_eval/models/vllm_causallms.py | 56 ++++++++++++++++++++++++-------- lm_eval/tasks/scrolls/task.py | 6 ++-- tests/test_evaluator.py | 4 ++- 5 files changed, 101 insertions(+), 40 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 46418d9a..8c125f71 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model from lm_eval.models.utils import ( Collator, clear_torch_cache, + configure_pad_token, get_dtype, pad_and_concat, stop_sequences_criteria, @@ -253,28 +254,7 @@ class HFLM(TemplateLM): self.logits_cache = logits_cache self.vocab_size = self.tokenizer.vocab_size # select (or create) a pad token to use - if self.tokenizer.pad_token: - pass - elif self.tokenizer.unk_token: - self.tokenizer.pad_token_id = self.tokenizer.unk_token_id - elif self.tokenizer.eos_token: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - else: - if getattr(self.config, "model_type", None) == "qwen": - # Qwen's trust_remote_code tokenizer does not allow for adding special tokens - self.tokenizer.pad_token = "<|endoftext|>" - elif ( - self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer" - or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer" - ): - # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) - # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer - # --- - # Note that the world tokenizer class 
name, might change in the future for the final huggingface merge - # https://github.com/huggingface/transformers/pull/26963 - assert self.tokenizer.pad_token_id == 0 - else: - self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) # TODO: override this for Gemma self.add_bos_token = add_bos_token diff --git a/lm_eval/models/utils.py b/lm_eval/models/utils.py index 09818f4e..8a81e5de 100644 --- a/lm_eval/models/utils.py +++ b/lm_eval/models/utils.py @@ -5,6 +5,7 @@ import itertools import time from functools import wraps from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -24,6 +25,11 @@ import transformers from lm_eval.utils import eval_logger +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + from transformers.configuration_utils import PretrainedConfig + + def chunks(iter, n: int = 0, fn=None): """ Divides an iterable into chunks of specified size or based on a given function. @@ -613,3 +619,48 @@ class Collator: if arr: yield arr + + +def configure_pad_token( + tokenizer: "PreTrainedTokenizerBase", + model_config: Optional["PretrainedConfig"] = None, +) -> "PreTrainedTokenizerBase": + """ + This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present. + Some tokenizers require special handling. + + Args: + tokenizer: The tokenizer for which the padding token is to be handled. + model_config: The configuration of the model. Default is None. + + Returns: + The tokenizer after the padding token has been handled. + + Raises: + AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0. + """ + if tokenizer.pad_token: + pass + elif tokenizer.unk_token: + tokenizer.pad_token_id = tokenizer.unk_token_id + elif tokenizer.eos_token: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + # handle special cases + if model_config and getattr(model_config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + tokenizer.pad_token = "<|endoftext|>" + elif ( + tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert tokenizer.pad_token_id == 0 + else: + tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + + return tokenizer diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index 34a63bf2..c8819d7f 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -1,7 +1,7 @@ import copy from importlib.metadata import version from importlib.util import find_spec -from typing import List, Literal, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union from more_itertools import distribute from packaging.version import parse as parse_version @@ -10,7 +10,7 @@ from tqdm import tqdm from lm_eval.api.instance import Instance from lm_eval.api.model import TemplateLM from lm_eval.api.registry import register_model -from lm_eval.models.utils import Collator, undistribute +from 
lm_eval.models.utils import Collator, configure_pad_token, undistribute from lm_eval.utils import ( eval_logger, get_rolling_token_windows, @@ -26,6 +26,8 @@ try: except ModuleNotFoundError: pass +if TYPE_CHECKING: + pass eval_logger = eval_logger @@ -118,6 +120,7 @@ class VLLM(TemplateLM): trust_remote_code=trust_remote_code, tokenizer_revision=tokenizer_revision, ) + self.tokenizer = configure_pad_token(self.tokenizer) self.add_bos_token = add_bos_token if "gemma" in pretrained.lower(): self.add_bos_token = True @@ -176,23 +179,46 @@ class VLLM(TemplateLM): def max_gen_toks(self): return self._max_gen_toks + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + """ + Method to apply a chat template to a list of chat history between user and model. + """ + return self.tokenizer.apply_chat_template( + chat_history, tokenize=False, add_generation_prompt=True + ) + + @property + def chat_template(self) -> str: + if self.tokenizer.chat_template is not None: + return self.tokenizer.chat_template + return self.tokenizer.default_chat_template + + @property + def tokenizer_name(self) -> str: + return self.tokenizer.name_or_path.replace("/", "__") + def tok_encode( self, - string: str, - left_truncate_len=None, - add_special_tokens=None, - truncation=False, - ): - """ """ + string: Union[str, List[str]], + left_truncate_len: int = None, + add_special_tokens: bool = False, + truncation: bool = False, + ) -> Union[List[int], List[List[int]]]: if not add_special_tokens: add_special_tokens = False or self.add_bos_token - encoding = self.tokenizer.encode( - string, add_special_tokens=add_special_tokens, truncation=truncation - ) + encoding: Union[List[List[int]], List[int]] = self.tokenizer( + string, + add_special_tokens=add_special_tokens, + truncation=truncation, + return_attention_mask=False, + ).input_ids # left-truncate the encoded context to be at most `left_truncate_len` tokens long if left_truncate_len: - encoding = encoding[-left_truncate_len:] + if not isinstance(string, str): + encoding = [enc[-left_truncate_len:] for enc in encoding] + else: + encoding = encoding[-left_truncate_len:] return encoding @@ -209,7 +235,7 @@ class VLLM(TemplateLM): sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs) else: sampling_params = SamplingParams( - temperature=0, prompt_logprobs=1, max_tokens=1 + temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False ) if self.data_parallel_size > 1: # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote @@ -290,7 +316,9 @@ class VLLM(TemplateLM): # batch tokenize contexts context, all_gen_kwargs = zip(*(req.args for req in requests)) - context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids + context_encoding: List[List[int]] = self.tok_encode( + context, add_special_tokens=self.add_bos_token + ) requests = [ ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs) ] diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index bbf1a3c3..2940eb2e 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -132,7 +132,7 @@ class _SCROLLSTask(ConfigurableTask): def training_docs(self): processed_docs = list(map(self._process_doc, self.dataset["train"])) - + # Flatten the list of lists since _process_doc returns a list of one element. 
processed_docs = [item for sublist in processed_docs for item in sublist] processed_dict = { @@ -143,13 +143,13 @@ class _SCROLLSTask(ConfigurableTask): def validation_docs(self): processed_docs = list(map(self._process_doc, self.dataset["validation"])) - + # Flatten the list of lists since _process_doc returns a list of one element. processed_docs = [item for sublist in processed_docs for item in sublist] processed_dict = { key: [d[key] for d in processed_docs] for key in processed_docs[0] } - + return Dataset.from_dict(processed_dict) def should_decontaminate(self): diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index d5edf9ae..71e25555 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -33,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" 10000, ), ], + ids=lambda d: f"{d}", ) def test_evaluator( task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int @@ -107,6 +108,7 @@ def test_evaluator( "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu", ), ], + ids=lambda d: f"{d}", ) def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str): results = evaluator.simple_evaluate( @@ -144,6 +146,6 @@ def test_printed_results(task_name: List[str], limit: int, model: str, model_arg try: t1_item = float(t1_item) t2_item = float(t2_item) - assert abs(t1_item - t2_item) < 0.1 + assert abs(t1_item - t2_item) < 0.3 except ValueError: assert t1_item == t2_item -- GitLab From 2a6acc88a0c31be7734aec85b17555323b70c049 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Sat, 29 Jun 2024 08:53:58 -0400 Subject: [PATCH 12/45] fail gracefully upon tokenizer logging failure (#2038) --- lm_eval/loggers/utils.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/lm_eval/loggers/utils.py b/lm_eval/loggers/utils.py index 670cdbff..348175a9 100644 --- a/lm_eval/loggers/utils.py +++ b/lm_eval/loggers/utils.py @@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]): def add_tokenizer_info(storage: Dict[str, Any], lm): if getattr(lm, "tokenizer", False): - tokenizer_info = { - "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id], - "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id], - "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id], - "eot_token_id": getattr(lm, "eot_token_id", None), - "max_length": getattr(lm, "max_length", None), - } - storage.update(tokenizer_info) - # seems gguf and textsynth do not have tokenizer + try: + tokenizer_info = { + "tokenizer_pad_token": [ + lm.tokenizer.pad_token, + lm.tokenizer.pad_token_id, + ], + "tokenizer_eos_token": [ + lm.tokenizer.eos_token, + lm.tokenizer.eos_token_id, + ], + "tokenizer_bos_token": [ + lm.tokenizer.bos_token, + lm.tokenizer.bos_token_id, + ], + "eot_token_id": getattr(lm, "eot_token_id", None), + "max_length": getattr(lm, "max_length", None), + } + storage.update(tokenizer_info) + except Exception as err: + logger.debug( + f"Logging detailed tokenizer info failed with {err}, skipping..." + ) + # seems gguf and textsynth do not have tokenizer else: logger.debug( "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results." 
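To illustrate the fallback path above (a hypothetical sketch, not code from this patch): a backend that exposes no `tokenizer` attribute, such as the gguf or textsynth models mentioned in the comment, leaves the results dict untouched and only emits a debug message.

```python
from lm_eval.loggers.utils import add_tokenizer_info


class _NoTokenizerLM:
    """Hypothetical stand-in for a backend (e.g. gguf/textsynth) without a tokenizer."""


results = {}
add_tokenizer_info(results, _NoTokenizerLM())
assert results == {}  # nothing is added; a debug log is emitted instead
```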
-- GitLab From a8ac0446b854970d7eca5e5c2b612e59c0af707a Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Mon, 1 Jul 2024 09:36:15 -0400 Subject: [PATCH 13/45] ship with exact_match function already used ; don't call evaluate.load() on import (#2045) --- lm_eval/api/metrics.py | 60 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 27b60ed1..1daf3847 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -1,10 +1,11 @@ import logging import math import random +import re +import string from collections.abc import Iterable from typing import List -import evaluate as hf_evaluate import numpy as np import sacrebleu import sklearn.metrics @@ -166,7 +167,60 @@ def acc_mutual_info_fn(items): # This is a passthrough function return items -exact_match = hf_evaluate.load("exact_match") +### the code used in the `exact_match_hf_evaluate` function is ported from +### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py +### which is under the apache license. + +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+def exact_match_hf_evaluate( + predictions, + references, + regexes_to_ignore=None, + ignore_case=False, + ignore_punctuation=False, + ignore_numbers=False, +): + if regexes_to_ignore is not None: + for s in regexes_to_ignore: + predictions = np.array([re.sub(s, "", x) for x in predictions]) + references = np.array([re.sub(s, "", x) for x in references]) + else: + predictions = np.asarray(predictions) + references = np.asarray(references) + + if ignore_case: + predictions = np.char.lower(predictions) + references = np.char.lower(references) + + if ignore_punctuation: + repl_table = string.punctuation.maketrans("", "", string.punctuation) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + if ignore_numbers: + repl_table = string.digits.maketrans("", "", string.digits) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + score_list = predictions == references + + return {"exact_match": np.mean(score_list)} + + +### @register_metric( @@ -176,7 +230,7 @@ exact_match = hf_evaluate.load("exact_match") aggregation="mean", ) def exact_match_fn(**kwargs): - return exact_match.compute(**kwargs) + return exact_match_hf_evaluate(**kwargs) @register_metric( -- GitLab From 3fa4fd725c8a428710109f1d6c14eda37e95baea Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Mon, 1 Jul 2024 09:50:22 -0400 Subject: [PATCH 14/45] update to v0.4.3 (#2046) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3818a1a8..040a9b7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm_eval" -version = "0.4.2" +version = "0.4.3" authors = [ {name="EleutherAI", email="contact@eleuther.ai"} ] -- GitLab From e83e891d186f0797640c656501b023104852831d Mon Sep 17 00:00:00 2001 From: Ogundepo Odunayo Date: Mon, 1 Jul 2024 13:16:09 -0400 Subject: [PATCH 15/45] fix wandb logger module import in example (#2041) --- examples/visualize-wandb.ipynb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/visualize-wandb.ipynb b/examples/visualize-wandb.ipynb index b6ac47ce..d0113d55 100644 --- a/examples/visualize-wandb.ipynb +++ b/examples/visualize-wandb.ipynb @@ -110,13 +110,15 @@ "cell_type": "markdown", "id": "e974cabdbe70b667", "metadata": {}, - "source": "" + "source": [] }, { "cell_type": "markdown", "id": "5178ca9445b844e4", "metadata": {}, - "source": "W&B can also be initialized programmatically for use outside the CLI to parse and log the results." + "source": [ + "W&B can also be initialized programmatically for use outside the CLI to parse and log the results." + ] }, { "cell_type": "code", @@ -126,7 +128,7 @@ "outputs": [], "source": [ "import lm_eval\n", - "from lm_eval.logging_utils import WandbLogger\n", + "from lm_eval.loggers import WandbLogger\n", "\n", "results = lm_eval.simple_evaluate(\n", " model=\"hf\",\n", -- GitLab From 9088a68fac0d94ab028a23de47f9d499db70cc62 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Mon, 1 Jul 2024 19:50:26 +0200 Subject: [PATCH 16/45] Fix strip whitespace filter (#2048) * batch commit * :Revert "batch commit" This reverts commit d859d1ca3bc2d6e45979ec51379df119f2a9633d. 
* batch commit * checkout from main * checkout from main * checkout from main * checkout from main * checkout from main * cleanup * cleanup * cleanup * cleanup * cleanup * cleanup --- lm_eval/filters/extraction.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lm_eval/filters/extraction.py b/lm_eval/filters/extraction.py index 9230808a..41dc6208 100644 --- a/lm_eval/filters/extraction.py +++ b/lm_eval/filters/extraction.py @@ -62,11 +62,8 @@ class WhitespaceFilter(Filter): def filter_set(inst): filtered_resp = [] for resp in inst: - if resp.startswith(" "): - resp = resp[1:] - + resp = resp.lstrip() filtered_resp.append(resp) - return filtered_resp filtered_resps = [filter_set(resp) for resp in resps] -- GitLab From 67a990e7345d4ba940e8281ac7c9113ccef2a446 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Mon, 1 Jul 2024 22:26:25 -0400 Subject: [PATCH 17/45] update gemma-2 default BOS behavior (#2049) --- lm_eval/models/huggingface.py | 3 +-- lm_eval/models/vllm_causallms.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 8c125f71..331684f5 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -256,9 +256,8 @@ class HFLM(TemplateLM): # select (or create) a pad token to use self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) - # TODO: override this for Gemma self.add_bos_token = add_bos_token - if getattr(self.config, "model_type", None) == "gemma": + if getattr(self.config, "model_type", None) in ["gemma", "gemma2"]: self.add_bos_token = True eval_logger.info( f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index c8819d7f..1a1067cc 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -125,7 +125,7 @@ class VLLM(TemplateLM): if "gemma" in pretrained.lower(): self.add_bos_token = True eval_logger.info( - "Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it." + "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it." 
) self.custom_prefix_token_id = prefix_token_id -- GitLab From 1870ee4e5352ce4c29de23469c2932c356a1b87f Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Wed, 3 Jul 2024 04:54:03 -0400 Subject: [PATCH 18/45] Update hellaswag.yaml (#2029) --- lm_eval/tasks/hellaswag/hellaswag.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lm_eval/tasks/hellaswag/hellaswag.yaml b/lm_eval/tasks/hellaswag/hellaswag.yaml index ec627da7..577ca69e 100644 --- a/lm_eval/tasks/hellaswag/hellaswag.yaml +++ b/lm_eval/tasks/hellaswag/hellaswag.yaml @@ -20,3 +20,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true -- GitLab From 3c8db1bb7be5662e4fd5b48a26b6214f758e483f Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 3 Jul 2024 14:15:49 +0200 Subject: [PATCH 19/45] Adds Open LLM Leaderboard Taks (#2047) * adds leaderboard tasks * Delete lm_eval/tasks/leaderboard/leaderboard_chat_template.yaml * add readme * Delete lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro_chat_template.yaml * modify readme * fix bbh task * fix bbh salient task * modify the readme * Delete lm_eval/tasks/leaderboard/ifeval/README.md * Delete lm_eval/tasks/leaderboard/math/README.md * add leaderboard to the tasks repertory * add anouncment about new leaderbaord tasks * linting * Update README.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * installs ifeval dependency in new_task github workflow --------- Co-authored-by: Nathan Habib Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- .github/workflows/new_tasks.yml | 2 +- README.md | 1 + lm_eval/tasks/README.md | 1 + lm_eval/tasks/leaderboard/README.md | 319 ++++ .../leaderboard/bbh_mc/_fewshot_template_yaml | 17 + .../bbh_mc/boolean_expressions.yaml | 16 + .../leaderboard/bbh_mc/causal_judgement.yaml | 64 + .../bbh_mc/date_understanding.yaml | 57 + .../leaderboard/bbh_mc/disambiguation_qa.yaml | 50 + .../leaderboard/bbh_mc/formal_fallacies.yaml | 57 + .../leaderboard/bbh_mc/geometric_shapes.yaml | 86 + .../tasks/leaderboard/bbh_mc/hyperbaton.yaml | 34 + .../logical_deduction_five_objects.yaml | 52 + .../logical_deduction_seven_objects.yaml | 52 + .../logical_deduction_three_objects.yaml | 52 + .../bbh_mc/movie_recommendation.yaml | 54 + .../tasks/leaderboard/bbh_mc/navigate.yaml | 40 + .../leaderboard/bbh_mc/object_counting.yaml | 20 + .../bbh_mc/penguins_in_a_table.yaml | 68 + .../reasoning_about_colored_objects.yaml | 116 ++ .../tasks/leaderboard/bbh_mc/ruin_names.yaml | 50 + .../salient_translation_error_detection.yaml | 103 + lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml | 45 + .../bbh_mc/sports_understanding.yaml | 19 + .../bbh_mc/temporal_sequences.yaml | 101 + ...racking_shuffled_objects_five_objects.yaml | 59 + ...acking_shuffled_objects_seven_objects.yaml | 59 + ...acking_shuffled_objects_three_objects.yaml | 59 + .../tasks/leaderboard/bbh_mc/web_of_lies.yaml | 22 + lm_eval/tasks/leaderboard/gpqa/_template_yaml | 20 + .../gpqa/gpqa_diamond_zeroshot.yaml | 4 + .../gpqa/gpqa_extended_zeroshot.yaml | 4 + .../leaderboard/gpqa/gpqa_main_zeroshot.yaml | 4 + lm_eval/tasks/leaderboard/gpqa/utils.py | 38 + lm_eval/tasks/leaderboard/ifeval/ifeval.yaml | 32 + .../tasks/leaderboard/ifeval/instructions.py | 1612 ++++++++++++++++ .../ifeval/instructions_registry.py | 168 ++ .../leaderboard/ifeval/instructions_util.py | 1682 +++++++++++++++++ 
lm_eval/tasks/leaderboard/ifeval/utils.py | 134 ++ lm_eval/tasks/leaderboard/leaderboard.yaml | 8 + lm_eval/tasks/leaderboard/math/_template_yaml | 28 + .../tasks/leaderboard/math/math_algebra.yaml | 3 + .../math/math_counting_and_prob.yaml | 3 + .../tasks/leaderboard/math/math_geometry.yaml | 3 + .../math/math_intermediate_algebra.yaml | 3 + .../leaderboard/math/math_num_theory.yaml | 3 + .../leaderboard/math/math_prealgebra.yaml | 3 + .../leaderboard/math/math_precalculus.yaml | 3 + lm_eval/tasks/leaderboard/math/utils.py | 294 +++ .../tasks/leaderboard/mmlu_pro/mmlu_pro.yaml | 17 + lm_eval/tasks/leaderboard/mmlu_pro/utils.py | 15 + lm_eval/tasks/leaderboard/musr/_template_yaml | 13 + .../musr/musr_murder_mysteries.yaml | 3 + .../musr/musr_object_placements.yaml | 3 + .../musr/musr_team_allocation.yaml | 3 + lm_eval/tasks/leaderboard/musr/utils.py | 26 + 56 files changed, 5803 insertions(+), 1 deletion(-) create mode 100644 lm_eval/tasks/leaderboard/README.md create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/_template_yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/utils.py create mode 100644 lm_eval/tasks/leaderboard/ifeval/ifeval.yaml create mode 100644 lm_eval/tasks/leaderboard/ifeval/instructions.py create mode 100644 lm_eval/tasks/leaderboard/ifeval/instructions_registry.py create mode 100644 
lm_eval/tasks/leaderboard/ifeval/instructions_util.py create mode 100644 lm_eval/tasks/leaderboard/ifeval/utils.py create mode 100644 lm_eval/tasks/leaderboard/leaderboard.yaml create mode 100644 lm_eval/tasks/leaderboard/math/_template_yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_algebra.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_geometry.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_num_theory.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_prealgebra.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_precalculus.yaml create mode 100644 lm_eval/tasks/leaderboard/math/utils.py create mode 100644 lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml create mode 100644 lm_eval/tasks/leaderboard/mmlu_pro/utils.py create mode 100644 lm_eval/tasks/leaderboard/musr/_template_yaml create mode 100644 lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/utils.py diff --git a/.github/workflows/new_tasks.yml b/.github/workflows/new_tasks.yml index 0df7111c..b748aab5 100644 --- a/.github/workflows/new_tasks.yml +++ b/.github/workflows/new_tasks.yml @@ -56,7 +56,7 @@ jobs: if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | python -m pip install --upgrade pip - pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/README.md b/README.md index ef00b8a6..ed50e5e5 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ New updates and features include: +- **New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.** - Internal refactoring - Config-based task creation and configuration - Easier import and sharing of externally-defined task config YAMLs diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index bb04d4f2..5d6393bb 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -56,6 +56,7 @@ | [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English | | [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian | | [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese | +| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). 
Those tasks are static and will not change through time | English | | [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese | | [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese | | [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English | diff --git a/lm_eval/tasks/leaderboard/README.md b/lm_eval/tasks/leaderboard/README.md new file mode 100644 index 00000000..70c8a3a1 --- /dev/null +++ b/lm_eval/tasks/leaderboard/README.md @@ -0,0 +1,319 @@ +# Leaderboard evaluations +Our goal with this group is to create a version of these evaluations that will +not change over time and that will power the Open LLM Leaderboard on HuggingFace. + +As we want to evaluate models across capabilities, the list currently contains: +- BBH (3-shots, multichoice) +- GPQA (0-shot, multichoice) +- mmlu-pro (5-shots, multichoice) +- Musr (0-shot, multichoice) +- ifeval (0-shot, generative) +- Math-lvl-5 (4-shots, generative, minerva version) + + +Details on the choice of those evals can be found [here](https://huggingface.co/spaces/open-llm-leaderboard/blog)! + +## BigBenchHard (BBH) + +A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). +These are the tasks for which prior language model evaluations did not +outperform the average human-rater. + +### Paper + +Title: Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them + +BIG-Bench (Srivastava et al., 2022) is a diverse evaluation suite that focuses on tasks believed to be beyond the capabilities of current language models. Language models have already made good progress on this benchmark, with the best model in the BIG-Bench paper outperforming average reported human-rater results on 65% of the BIG-Bench tasks via few-shot prompting. But on what tasks do language models fall short of average human-rater performance, and are those tasks actually unsolvable by current language models? +In this work, we focus on a suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). These are the tasks for which prior language model evaluations did not outperform the average human-rater. We find that applying chain-of-thought (CoT) prompting to BBH tasks enables PaLM to surpass the average human-rater performance on 10 of the 23 tasks, and Codex (code-davinci-002) to surpass the average human-rater performance on 17 of the 23 tasks. Since many tasks in BBH require multi-step reasoning, few-shot prompting without CoT, as done in the BIG-Bench evaluations (Srivastava et al., 2022), substantially underestimates the best performance and capabilities of language models, which is better captured via CoT prompting. As further analysis, we explore the interaction between CoT and model scale on BBH, finding that CoT enables emergent task performance on several BBH tasks with otherwise flat scaling curves.
+ + +- paper: https://huggingface.co/papers/2210.09261 +- Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard + +### Citation + +``` +@article{suzgun2022challenging, + title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them}, + author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and and Wei, Jason}, + journal={arXiv preprint arXiv:2210.09261}, + year={2022} +} +``` + +### Groups + +- `leaderboard_bbh` + +### Tasks + +- `leaderboard_bbh_boolean_expressions` +- `leaderboard_bbh_causal_judgement` +- `leaderboard_bbh_date_understanding` +- `leaderboard_bbh_disambiguation_qa` +- `leaderboard_bbh_dyck_languages` +- `leaderboard_bbh_formal_fallacies` +- `leaderboard_bbh_geometric_shapes` +- `leaderboard_bbh_hyperbaton` +- `leaderboard_bbh_logical_deduction_five_objects` +- `leaderboard_bbh_logical_deduction_seven_objects` +- `leaderboard_bbh_logical_deduction_three_objects` +- `leaderboard_bbh_movie_recommendation` +- `leaderboard_bbh_multistep_arithmetic_two` +- `leaderboard_bbh_navigate` +- `leaderboard_bbh_object_counting` +- `leaderboard_bbh_penguins_in_a_table` +- `leaderboard_bbh_reasoning_about_colored_objects` +- `leaderboard_bbh_ruin_names` +- `leaderboard_bbh_salient_translation_error_detection` +- `leaderboard_bbh_snarks` +- `leaderboard_bbh_sports_understanding` +- `leaderboard_bbh_temporal_sequences` +- `leaderboard_bbh_tracking_shuffled_objects_five_objects` +- `leaderboard_bbh_tracking_shuffled_objects_seven_objects` +- `leaderboard_bbh_tracking_shuffled_objects_three_objects` +- `leaderboard_bbh_web_of_lies` +- `leaderboard_bbh_word_sorting` + +## GPQA + +### Paper + +Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark + +We present GPQA, a challenging dataset of 448 multiple-choice questions written +by domain experts in biology, physics, and chemistry. We ensure that the +questions are high-quality and extremely difficult: experts who have or are +pursuing PhDs in the corresponding domains reach 65% accuracy (74% when +discounting clear mistakes the experts identified in retrospect), while highly +skilled non-expert validators only reach 34% accuracy, despite spending on +average over 30 minutes with unrestricted access to the web (i.e., the +questions are “Google-proof”). The questions are also difficult for +state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving +39% accuracy. If we are to use future AI systems to help us answer very hard +questions—for example, when developing new scientific knowledge—we need to +develop scalable oversight methods that enable humans to supervise their +outputs, which may be difficult even if the supervisors are themselves skilled +and knowledgeable. The difficulty of GPQA both for skilled non-experts and +frontier AI systems should enable realistic scalable oversight experiments, +which we hope can help devise ways for human experts to reliably get truthful +information from AI systems that surpass human capabilities. + +- Paper: https://huggingface.co/papers/2311.12022 +- Homepage: https://github.com/idavidrein/gpqa/tree/main + +### Citation + +``` +@misc{rein2023gpqa, + title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark}, + author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. 
Bowman}, + year={2023}, + eprint={2311.12022}, + archivePrefix={arXiv}, + primaryClass={cs.AI} +} +``` + +### Groups + +- `leaderboard_gpqa` + +### Tasks + +- `leaderboard_gpqa_extended` +- `leaderboard_gpqa_diamond` +- `leaderboard_gpqa_main` + +## IFEval + +### Paper + +Title: Instruction-Following Evaluation for Large Language Models + +One core capability of Large Language Models (LLMs) is to follow natural +language instructions. However, the evaluation of such abilities is not +standardized: Human evaluations are expensive, slow, and not objectively +reproducible, while LLM-based auto-evaluation is potentially biased or limited +by the ability of the evaluator LLM. To overcome these issues, we introduce +Instruction-Following Eval (IFEval) for large language models. IFEval is a +straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set +of "verifiable instructions" such as "write in more than 400 words" and +"mention the keyword of AI at least 3 times". We identified 25 types of those +verifiable instructions and constructed around 500 prompts, with each prompt +containing one or more verifiable instructions. We show evaluation results of +two widely available LLMs on the market. + +- Paper: https://huggingface.co/papers/2311.07911 +- Homepage: https://github.com/google-research/google-research/tree/master/instruction_following_eval + +### Citation + +``` +@article{zhou2023instructionfollowing, + title={Instruction-Following Evaluation for Large Language Models}, + author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou}, + journal={arXiv preprint arXiv:2311.07911}, + year={2023}, +} +``` + +### Tasks + +- `leaderboard_ifeval` + +## MATH-hard + +This is the 4-shot variant of Minerva MATH, keeping only the Level 5 questions. + +### Paper + +Title: Measuring Mathematical Problem Solving With the MATH Dataset + +Many intellectual endeavors require mathematical problem solving, but this +skill remains beyond the capabilities of computers. To measure this ability in +machine learning models, we introduce MATH, a new dataset of 12,500 challenging +competition mathematics problems. Each problem in MATH has a full step-by-step +solution which can be used to teach models to generate answer derivations and +explanations. + +NOTE: The few-shot prompting and the generated-answer extraction are based on +[Minerva](https://arxiv.org/abs/2206.14858), and exact match equivalence is +calculated using the `sympy` library. This requires additional dependencies, +which can be installed via the `lm-eval[math]` extra.
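+Once the extra is installed, a minimal sketch of running these subtasks through
+the Python API could look like the following (the model below is only a
+placeholder; substitute any Hugging Face model you want to score):
+
+```python
+import lm_eval
+
+# Run the Level 5 MATH subtasks via the "leaderboard_math_hard" group.
+results = lm_eval.simple_evaluate(
+    model="hf",
+    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
+    tasks=["leaderboard_math_hard"],
+)
+print(results["results"])
+```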
+ +- Paper: https://huggingface.co/papers/2103.03874 +- Homepage: https://github.com/hendrycks/math + + +### Citation + +``` +@article{hendrycksmath2021, + title={Measuring Mathematical Problem Solving With the MATH Dataset}, + author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} +@misc{2206.14858, +Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dye and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra}, +Title = {Solving Quantitative Reasoning Problems with Language Models}, +Year = {2022}, +Eprint = {arXiv:2206.14858}, +} +``` + +### Groups + +- `leaderboard_math_hard` + +### Tasks + +- `leaderboard_math_algebra_hard` +- `leaderboard_math_counting_and_prob_hard` +- `leaderboard_math_geometry_hard` +- `leaderboard_math_intermediate_algebra_hard` +- `leaderboard_math_num_theory_hard` +- `leaderboard_math_prealgebra_hard` +- `leaderboard_math_precalc_hard` + + +## MMLU-Pro + +### Paper + +Title: MMLU-Pro: A More Robust and Challenging Multi-Task Language +Understanding Benchmark + +In the age of large-scale language models, benchmarks like the Massive +Multitask Language Understanding (MMLU) have been pivotal in pushing the +boundaries of what AI can achieve in language comprehension and reasoning +across diverse domains. However, as models continue to improve, their +performance on these benchmarks has begun to plateau, making it increasingly +difficult to discern differences in model capabilities. This paper introduces +MMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven +MMLU benchmark by integrating more challenging, reasoning-focused questions and +expanding the choice set from four to ten options. Additionally, MMLU-Pro +eliminates the trivial and noisy questions in MMLU. Our experimental results +show that MMLU-Pro not only raises the challenge, causing a significant drop in +accuracy by 16% to 33% compared to MMLU but also demonstrates greater stability +under varying prompts. With 24 different prompt styles tested, the sensitivity +of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in +MMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT) +reasoning achieved better performance on MMLU-Pro compared to direct answering, +which is in stark contrast to the findings on the original MMLU, indicating +that MMLU-Pro includes more complex reasoning questions. Our assessments +confirm that MMLU-Pro is a more discriminative benchmark to better track +progress in the field. 
+ +- Paper: https://huggingface.co/papers/2406.01574 +- Homepage: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro + +### Citation + +``` +@misc{wang2024mmluprorobustchallengingmultitask, + title={MMLU-Pro: A More Robust and Challenging Multi-Task Language + Understanding Benchmark}, + author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen}, + year={2024}, + eprint={2406.01574}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2406.01574}, +} +``` + +## Musr + +### Paper + +Title: MuSR: Testing the Limits of Chain-of-thought with Multistep Soft +Reasoning + +While large language models (LLMs) equipped with techniques like +chain-of-thought prompting have demonstrated impressive capabilities, they +still fall short in their ability to reason robustly in complex settings. +However, evaluating LLM reasoning is challenging because system capabilities +continue to grow while benchmark datasets for tasks like logical deduction have +remained static. We introduce MuSR, a dataset for evaluating language models on +multistep soft reasoning tasks specified in a natural language narrative. This +dataset has two crucial features. First, it is created through a novel +neurosymbolic synthetic-to-natural generation algorithm, enabling the +construction of complex reasoning instances that challenge GPT-4 (e.g., murder +mysteries roughly 1000 words in length) and which can be scaled further as more +capable LLMs are released. Second, our dataset instances are free text +narratives corresponding to real-world domains of reasoning; this makes it +simultaneously much more challenging than other synthetically-crafted +benchmarks while remaining realistic and tractable for human annotators to +solve with high accuracy. We evaluate a range of LLMs and prompting techniques +on this dataset and characterize the gaps that remain for techniques like +chain-of-thought to perform robust reasoning. 
+ +- Paper: https://huggingface.co/papers/2310.16049 +- Homepage: https://zayne-sprague.github.io/MuSR/ + +### Citation + +``` +@misc{sprague2024musrtestinglimitschainofthought, + title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft + Reasoning}, + author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett}, + year={2024}, + eprint={2310.16049}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2310.16049}, +} +``` + +### Groups + +- `leaderboard_musr` + +### Tasks + +- `leaderboard_musr_murder_mysteries` +- `leaderboard_musr_object_placements` +- `leaderboard_musr_team_allocation` diff --git a/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml b/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml new file mode 100644 index 00000000..c461f75f --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml @@ -0,0 +1,17 @@ +group: leaderboard_bbh +dataset_path: SaylorTwift/bbh +output_type: multiple_choice +test_split: test +doc_to_text: 'Q: {{input}} + + A:' +doc_to_target: "{{target}}" +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +num_fewshot: 3 +fewshot_config: + sampler: first_n +metadata: + version: 0.0 diff --git a/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml b/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml new file mode 100644 index 00000000..970aa6c1 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml @@ -0,0 +1,16 @@ +dataset_name: boolean_expressions +description: 'Evaluate the result of a random Boolean expression. + + ' +doc_to_choice: ["False", "True"] +fewshot_config: + sampler: first_n + samples: + - input: not ( ( not not True ) ) is + target: 'False' + - input: True and False and not True and True is + target: 'False' + - input: not not ( not ( False ) ) is + target: 'True' +include: _fewshot_template_yaml +task: leaderboard_bbh_boolean_expressions diff --git a/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml b/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml new file mode 100644 index 00000000..236bc568 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml @@ -0,0 +1,64 @@ +dataset_name: causal_judgement +description: 'Answer questions about causal attribution. + + ' +doc_to_choice: ["Yes", "No"] +fewshot_config: + sampler: first_n + samples: + - input: 'How would a typical person answer each of the following questions about + causation? + + Frank T., had an ongoing dispute with his neighbor over a stretch of land and + one day decided to shoot his neighbor in the body. Frank T. had no experience + with guns, his hand slipped on the barrel of the gun, and the shot went wild. + Nonetheless, the bullet bounced off a large boulder several feet away and hit + the neighbor''s body, causing significant injury. Did Frank T. intentionally + shoot his neighbor in the body? + + Options: + + - Yes + + - No' + target: 'No' + - input: 'How would a typical person answer each of the following questions about + causation? + + Suzy and Billy are working on a project that is very important for our nation''s + security. The boss tells them both: "Be sure that you are here at exactly 9 + am. It is absolutely essential that you arrive at that time." Both Billy and + Suzy arrive at 9 am. As it happens, there was a motion detector installed in + the room where they arrived. The motion detector was set up to be triggered + if at least one person appeared in the room at the same time. 
So the motion + detector went off. Did Billy cause the motion detector to go off? + + Options: + + - Yes + + - No' + target: 'Yes' + - input: 'How would a typical person answer each of the following questions about + causation? + + George and his sister Lena reunite at their parents'' house for Thanksgiving. + Whereas George just got into medical school, Lena is unhappy in her marriage + and recently lost her job. Over the course of the day, George and Lena get into + a number of heated arguments. Later in the afternoon they play a game of darts. + They split the first two games, and the third game is close until the end. Who + will win comes down to George''s last shot. If he hits a high point region, + he wins; if he hits a low point region, Lena wins. George thinks of the difficult + time Lena is having, and he really wants to let her win. He aims the dart at + the low point region. He sets up his shot and the dart lands in the low point + region. After his shot, Lena wins the game and is very happy. Did George hit + the low point region intentionally? + + Options: + + - Yes + + - No' + target: 'Yes' +include: _fewshot_template_yaml +task: leaderboard_bbh_causal_judgement diff --git a/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml b/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml new file mode 100644 index 00000000..bbdf8df6 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml @@ -0,0 +1,57 @@ +dataset_name: date_understanding +description: 'Infer the date from context. + + ' +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY? + + Options: + + (A) 12/14/2026 + + (B) 12/14/1950 + + (C) 12/14/2007 + + (D) 12/14/1937 + + (E) 07/14/1938 + + (F) 12/14/1988' + target: (D) + - input: 'Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY? + + Options: + + (A) 09/04/2018 + + (B) 11/11/2018 + + (C) 08/25/2018 + + (D) 11/02/2018 + + (E) 11/04/2018' + target: (B) + - input: 'Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. + What is the date tomorrow in MM/DD/YYYY? + + Options: + + (A) 01/11/1961 + + (B) 01/03/1963 + + (C) 01/18/1961 + + (D) 10/14/1960 + + (E) 01/03/1982 + + (F) 12/03/1960' + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_date_understanding diff --git a/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml b/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml new file mode 100644 index 00000000..34efa6f6 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml @@ -0,0 +1,50 @@ +dataset_name: disambiguation_qa +description: 'Clarify the meaning of sentences with ambiguous pronouns. + + ' +doc_to_choice: ["(A)", "(B)", "(C)"] +fewshot_config: + sampler: first_n + samples: + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: The chief told the counselor that they took the day off. + + Options: + + (A) The chief took the day off + + (B) The counselor took the day off + + (C) Ambiguous' + target: (A) + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: The manager sent a message to the secretary, but he didn''t reply + yet. 
+ + Options: + + (A) The secretary didn''t reply yet + + (B) The manager didn''t reply yet + + (C) Ambiguous' + target: (A) + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: Bailey will plan to meet the director at his office + + Options: + + (A) It will be Bailey''s office + + (B) It will be the director''s office + + (C) Ambiguous' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_disambiguation_qa diff --git a/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml b/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml new file mode 100644 index 00000000..7fb642e5 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml @@ -0,0 +1,57 @@ +dataset_name: formal_fallacies +description: 'Distinguish deductively valid arguments from formal fallacies. + + ' +doc_to_choice: ["valid", "invalid"] +fewshot_config: + sampler: first_n + samples: + - input: '"It is not always easy to see who is related to whom -- and in which ways. + The following argument pertains to this question: To begin with, Lesley is a + close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate + of Lowell is sufficient for being a great-grandfather of Leroy. It follows that + Lesley is a great-grandfather of Leroy." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: valid + - input: '"It is not always easy to see who is related to whom -- and in which ways. + The following argument pertains to this question: Whoever is not a great-grandfather + of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient + for not being a great-grandfather of Clyde. We may conclude: Everyone who is + an ancestor of Dana is a stepbrother of Brian, too." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: valid + - input: '"It is not always easy to grasp who is consuming which products. The following + argument pertains to this question: Every infrequent user of Paul Mitchell shampoo + is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, + or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo + and, in the same time, a loyal buyer of Caress soap. It follows that whoever + is an infrequent user of Paul Mitchell shampoo is not a regular consumer of + Lush soap." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: invalid +include: _fewshot_template_yaml +task: leaderboard_bbh_formal_fallacies diff --git a/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml b/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml new file mode 100644 index 00000000..936caece --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml @@ -0,0 +1,86 @@ +dataset_name: geometric_shapes +description: 'Name geometric shapes from their SVG paths. 
+ + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)"] +fewshot_config: + sampler: first_n + samples: + - input: 'This SVG path element + draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (F) + - input: 'This SVG path element draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (G) + - input: 'This SVG path element draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_geometric_shapes diff --git a/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml b/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml new file mode 100644 index 00000000..24ddaa73 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml @@ -0,0 +1,34 @@ +dataset_name: hyperbaton +description: 'Order adjectives correctly in English sentences. + + ' +doc_to_choice: ["(A)", "(B)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) rubber terrible ship + + (B) terrible rubber ship' + target: (B) + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) repulsive small Brazilian exercise ship + + (B) Brazilian repulsive exercise small ship' + target: (A) + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) blue gold wonderful square shoe + + (B) wonderful square blue gold shoe' + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_hyperbaton diff --git a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml new file mode 100644 index 00000000..21280253 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml @@ -0,0 +1,52 @@ +dataset_name: logical_deduction_five_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"] +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. 
+ The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_logical_deduction_five_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml new file mode 100644 index 00000000..11198d6f --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml @@ -0,0 +1,52 @@ +dataset_name: logical_deduction_seven_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"] +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. + The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_logical_deduction_seven_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml new file mode 100644 index 00000000..f2522f3c --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml @@ -0,0 +1,52 @@ +dataset_name: logical_deduction_three_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + ' +doc_to_choice: ["(A)","(B)","(C)"] +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. 
The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. + The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_logical_deduction_three_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml b/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml new file mode 100644 index 00000000..908272af --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml @@ -0,0 +1,54 @@ +dataset_name: movie_recommendation +description: 'Recommend movies similar to the given list of movies. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones + and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big + Lebowski: + + Options: + + (A) Tetsuo + + (B) the Ironman + + (C) The Princess Bride + + (D) The Barkley Marathons The Race That Eats Its Young + + (E) Bug' + target: (C) + - input: 'Find a movie similar to Twister, The Silence of the Lambs, Independence + Day, Braveheart: + + Options: + + (A) They Shoot Horses + + (B) Don''t They + + (C) Forrest Gump + + (D) The Salton Sea + + (E) Extreme Days' + target: (C) + - input: 'Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest + Gump: + + Options: + + (A) Phenomena + + (B) Lilting + + (C) Catwoman + + (D) Edge of Tomorrow' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_movie_recommendation diff --git a/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml b/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml new file mode 100644 index 00000000..22d0ddf3 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml @@ -0,0 +1,40 @@ +dataset_name: navigate +description: 'Given a series of navigation instructions, determine whether one would + end up back at the starting point. + + ' +doc_to_choice: ["Yes","No"] +fewshot_config: + sampler: first_n + samples: + - input: 'If you follow these instructions, do you return to the starting point? + Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. + Take 8 steps. + + Options: + + - Yes + + - No' + target: 'No' + - input: 'If you follow these instructions, do you return to the starting point? + Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. + Take 1 step. + + Options: + + - Yes + + - No' + target: 'No' + - input: 'If you follow these instructions, do you return to the starting point? + Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right. 
+ + Options: + + - Yes + + - No' + target: 'Yes' +include: _fewshot_template_yaml +task: leaderboard_bbh_navigate diff --git a/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml b/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml new file mode 100644 index 00000000..9a98c508 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml @@ -0,0 +1,20 @@ +dataset_name: object_counting +description: 'Questions that involve enumerating objects and asking the model to count + them. + + ' +doc_to_choice: ["0","1","2","3","4","5","6","7","8","9","10", "11", "12", "13", "14", "15", "16", "17", "18"] +fewshot_config: + sampler: first_n + samples: + - input: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, + a flute, an orange, and a violin. How many fruits do I have? + target: '6' + - input: I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape, + a nectarine, and three plums. How many fruits do I have? + target: '11' + - input: I have a lettuce head, a head of broccoli, an onion, a stalk of celery, + two carrots, a garlic, and a yam. How many vegetables do I have? + target: '8' +include: _fewshot_template_yaml +task: leaderboard_bbh_object_counting diff --git a/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml b/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml new file mode 100644 index 00000000..27687867 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml @@ -0,0 +1,68 @@ +dataset_name: penguins_in_a_table +description: 'Answer questions about a table of penguins and their attributes. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add + a penguin to the table: + + James, 12, 90, 12 + + How many penguins are less than 8 years old? + + Options: + + (A) 1 + + (B) 2 + + (C) 3 + + (D) 4 + + (E) 5' + target: (B) + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. Which is + the youngest penguin? + + Options: + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James' + target: (B) + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. What is + the name of the second penguin sorted by alphabetic order? 
+ + Options: + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_penguins_in_a_table diff --git a/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..d4bb5c38 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml @@ -0,0 +1,116 @@ +dataset_name: reasoning_about_colored_objects +description: 'Answer extremely simple questions about the colors of objects on a surface. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)","(L)","(M)","(N)","(O)","(P)","(Q)","(R)"] +fewshot_config: + sampler: first_n + samples: + - input: 'On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, + a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the + stress ball? + + Options: + + (A) red + + (B) orange + + (C) yellow + + (D) green + + (E) blue + + (F) brown + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) teal + + (K) turquoise + + (L) burgundy + + (M) silver + + (N) gold + + (O) black + + (P) grey + + (Q) purple + + (R) pink' + target: (E) + - input: 'On the table, you see a bunch of objects arranged in a row: a purple paperclip, + a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve + fidget spinner, and a burgundy pen. What is the color of the object directly + to the right of the stress ball? + + Options: + + (A) red + + (B) orange + + (C) yellow + + (D) green + + (E) blue + + (F) brown + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) teal + + (K) turquoise + + (L) burgundy + + (M) silver + + (N) gold + + (O) black + + (P) grey + + (Q) purple + + (R) pink' + target: (F) + - input: 'On the nightstand, you see the following items arranged in a row: a teal + plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, + a pink notebook, and a grey cup. How many non-orange items do you see to the + left of the teal item? + + Options: + + (A) zero + + (B) one + + (C) two + + (D) three + + (E) four + + (F) five + + (G) six' + target: (A) +include: _fewshot_template_yaml +task: leaderboard_bbh_reasoning_about_colored_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml b/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml new file mode 100644 index 00000000..1d865267 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml @@ -0,0 +1,50 @@ +dataset_name: ruin_names +description: 'Select the humorous edit that ''ruins'' the input movie or musical artist + name. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''whitesnake''? + + Options: + + (A) whitesnape + + (B) whitesnapke + + (C) whitesnuake + + (D) mwhitesnake' + target: (A) + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''one of our dinosaurs is missing''? + + Options: + + (A) ofne of our dinosaurs is missing + + (B) one af our dinosaurs is missing + + (C) one of our dinosaurs is pissing + + (D) one of our dinosaur is missing' + target: (C) + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''counting crows''? 
+ + Options: + + (A) countingy crows + + (B) counting cows + + (C) courting crows + + (D) coutnting crows' + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_ruin_names diff --git a/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml b/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml new file mode 100644 index 00000000..dd9c9142 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml @@ -0,0 +1,103 @@ +dataset_name: salient_translation_error_detection +description: 'Detect the type of error in an English translation of a German source + sentence. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"] +fewshot_config: + sampler: first_n + samples: + - input: "The following translations from German to English contain a particular + error. That error will be one of the following types: Named Entities: An entity + (names, places, locations, etc.) is changed to a different entity. Numerical + Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. + Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are + changed. Negation or Antonyms: Introduce or remove a negation or change comparatives + to their antonyms. Facts: Trivial factual errors not pertaining to the above + classes are introduced in the translations. Dropped Content: A significant clause + in the translation is removed. Please identify that error. Source: In der Liste + der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen + Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet. + + Translation: In the list of architectural monuments in Lenzen all architectural + monuments of the Brandenburg city of Lenzen and its districts are listed. + + The translation contains an error pertaining to + + Options: + + (A) Modifiers or Adjectives + + (B) Numerical Values + + (C) Negation or Antonyms + + (D) Named Entities + + (E) Dropped Content + + (F) Facts" + target: (D) + - input: "The following translations from German to English contain a particular + error. That error will be one of the following types: Named Entities: An entity + (names, places, locations, etc.) is changed to a different entity. Numerical + Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. + Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are + changed. Negation or Antonyms: Introduce or remove a negation or change comparatives + to their antonyms. Facts: Trivial factual errors not pertaining to the above + classes are introduced in the translations. Dropped Content: A significant clause + in the translation is removed. Please identify that error. Source: Auf dieser + Seite sind die Baudenkm\xE4ler der oberbayerischen Gro\xDFen Kreisstadt Landsberg + am Lech zusammengestellt. + + Translation: On this page are compiled the architectural monuments of the town of + Landsberg am Lech. + + The translation contains an error pertaining to + + Options: + + (A) Modifiers or Adjectives + + (B) Numerical Values + + (C) Negation or Antonyms + + (D) Named Entities + + (E) Dropped Content + + (F) Facts" + target: (E) + - input: "The following translations from German to English contain a particular + error. That error will be one of the following types: Named Entities: An entity + (names, places, locations, etc.) is changed to a different entity. Numerical + Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. 
+ Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are + changed. Negation or Antonyms: Introduce or remove a negation or change comparatives + to their antonyms. Facts: Trivial factual errors not pertaining to the above + classes are introduced in the translations. Dropped Content: A significant clause + in the translation is removed. Please identify that error. Source: \u0141eba ist + eine Kleinstadt und ein Badeort im Powiat L\u0119borski der polnischen Woiwodschaft + Pommern. + + Translation: Eba is not a small town and seaside resort in the Powiat L\xE9borski + county of the Pomeranian Voivodeship of Poland.\nThe translation contains an error + pertaining to + + Options: + + (A) Modifiers or Adjectives + + (B) Numerical Values + + (C) Negation or Antonyms + + (D) Named Entities + + (E) Dropped Content + + (F) Facts" + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_salient_translation_error_detection diff --git a/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml b/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml new file mode 100644 index 00000000..465e39b1 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml @@ -0,0 +1,45 @@ +dataset_name: snarks +description: 'Determine which of two sentences is sarcastic. + + + According to Cambridge University Dictionary, sarcasm is "the use of remarks that + clearly mean the opposite of what they say, made in order to hurt someone''s feelings + or to criticize something in a humorous way." Sarcastic sentences often contain + satirical or ironic utterances, hyperboles, ambivalent or witty remarks. + + + ' +doc_to_choice: ["(A)","(B)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Which statement is sarcastic? + + Options: + + (A) Yes, because having interests and actively researching them is a huge waste + + (B) Yes, because having interests and actively researching them is a huge deal' + target: (A) + - input: 'Which statement is sarcastic? + + Options: + + (A) No one is going to disagree with you on this. Avoiding ad hominem attacks + really help your case + + (B) No one is going to disagree with you on this. Ad hominem attacks really + help your case' + target: (B) + - input: 'Which statement is sarcastic? + + Options: + + (A) Consistency in the league''s punishments? What do you think this is supposed + to be, politics? + + (B) Consistency in the league''s punishments? What do you think this is supposed + to be, moral?' + target: (A) +include: _fewshot_template_yaml +task: leaderboard_bbh_snarks diff --git a/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml b/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml new file mode 100644 index 00000000..f8c1c626 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml @@ -0,0 +1,19 @@ +dataset_name: sports_understanding +description: 'Determine whether an artificially constructed sentence relating to sports + is plausible or not. + + ' +doc_to_choice: ["yes","no"] +fewshot_config: + sampler: first_n + samples: + - input: Is the following sentence plausible? "Bam Adebayo scored a reverse layup + in the Western Conference Finals." + target: 'yes' + - input: Is the following sentence plausible? "Santi Cazorla scored a touchdown." + target: 'no' + - input: Is the following sentence plausible? "DeMar DeRozan was called for the + goal tend." 
+ target: 'yes' +include: _fewshot_template_yaml +task: leaderboard_bbh_sports_understanding diff --git a/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml b/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml new file mode 100644 index 00000000..bafedd9c --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml @@ -0,0 +1,101 @@ +dataset_name: temporal_sequences +description: 'Task description: Answer questions about which times certain events + could have occurred. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Today, Emily went to the museum. Between what times could they have gone? + + We know that: + + Emily woke up at 1pm. + + Elizabeth saw Emily reading at the library from 2pm to 4pm. + + Jessica saw Emily watching a movie at the theater from 4pm to 5pm. + + Leslie saw Emily waiting at the airport from 5pm to 6pm. + + William saw Emily buying clothes at the mall from 6pm to 7pm. + + The museum was closed after 7pm. + + Between what times could Emily have gone to the museum? + + Options: + + (A) 1pm to 2pm + + (B) 6pm to 7pm + + (C) 5pm to 6pm + + (D) 2pm to 4pm' + target: (A) + - input: 'Today, Elizabeth went to the amusement park. Between what times could + they have gone? + + We know that: + + Elizabeth woke up at 7am. + + David saw Elizabeth fixing their computer at the electronic store from 1pm to + 2pm. + + Sarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm. + + Susan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm. + + Andrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm. + + Emily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm. + + The amusement park was closed after 10pm. + + Between what times could Elizabeth have gone to the amusement park? + + Options: + + (A) 7am to 1pm + + (B) 9pm to 10pm + + (C) 1pm to 2pm + + (D) 3pm to 6pm' + target: (A) + - input: 'Today, Tiffany went to the beach. Between what times could they have gone? + + We know that: + + Tiffany woke up at 5am. + + Betty saw Tiffany getting a coffee at the cafe from 5am to 6am. + + Jessica saw Tiffany working at the office from 6am to 9am. + + John saw Tiffany stretching at a yoga studio from 9am to 12pm. + + Sean saw Tiffany sitting on a rooftop from 12pm to 2pm. + + Sarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm. + + The beach was closed after 4pm. + + Between what times could Tiffany have gone to the beach? + + Options: + + (A) 9am to 12pm + + (B) 12pm to 2pm + + (C) 5am to 6am + + (D) 3pm to 4pm' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_temporal_sequences diff --git a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 00000000..59667929 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,59 @@ +dataset_name: tracking_shuffled_objects_five_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. 
+ + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_tracking_shuffled_objects_five_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 00000000..1492eb61 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,59 @@ +dataset_name: tracking_shuffled_objects_seven_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. + + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. 
At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_tracking_shuffled_objects_seven_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 00000000..134a991b --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,59 @@ +dataset_name: tracking_shuffled_objects_three_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + ' +doc_to_choice: ["(A)","(B)","(C)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. + + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_tracking_shuffled_objects_three_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml b/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml new file mode 100644 index 00000000..a201d8ff --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml @@ -0,0 +1,22 @@ +dataset_name: web_of_lies +description: 'Evaluate a random boolean function expressed as a word problem. + + ' +doc_to_choice: ["Yes","No"] +fewshot_config: + sampler: first_n + samples: + - input: 'Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina + says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent + lies. Does Raymond tell the truth?' + target: 'Yes' + - input: 'Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie + tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell + the truth?' + target: 'Yes' + - input: 'Question: Kristian tells the truth. Michaela says Kristian lies. Raymond + says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says + Osvaldo tells the truth. Does Jamey tell the truth?' 
+ target: 'No' +include: _fewshot_template_yaml +task: leaderboard_bbh_web_of_lies diff --git a/lm_eval/tasks/leaderboard/gpqa/_template_yaml b/lm_eval/tasks/leaderboard/gpqa/_template_yaml new file mode 100644 index 00000000..83cb1872 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/_template_yaml @@ -0,0 +1,20 @@ +dataset_path: Idavidrein/gpqa +group: leaderboard_gpqa +output_type: multiple_choice +process_docs: !function utils.process_docs +training_split: train +# Because huggingface dataset only has train split +validation_split: train +test_split: null +doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer: " +doc_to_target: answer +doc_to_choice: ["(A)", "(B)", "(C)", "(D)"] +num_fewshot: 0 +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +fewshot_config: + sampler: first_n diff --git a/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml b/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml new file mode 100644 index 00000000..a3892165 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_diamond +include: _template_yaml +task: leaderboard_gpqa_diamond diff --git a/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml b/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml new file mode 100644 index 00000000..b481d960 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_extended +include: _template_yaml +task: leaderboard_gpqa_extended diff --git a/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml b/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml new file mode 100644 index 00000000..a15e45b6 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_main +include: _template_yaml +task: leaderboard_gpqa_main diff --git a/lm_eval/tasks/leaderboard/gpqa/utils.py b/lm_eval/tasks/leaderboard/gpqa/utils.py new file mode 100644 index 00000000..c2317e02 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/utils.py @@ -0,0 +1,38 @@ +import random +import re + +import datasets + + +def preprocess(text): + if text is None: + return " " + text = text.strip() + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc): + choices = [ + preprocess(doc["Incorrect Answer 1"]), + preprocess(doc["Incorrect Answer 2"]), + preprocess(doc["Incorrect Answer 3"]), + preprocess(doc["Correct Answer"]), + ] + + random.shuffle(choices) + correct_answer_index = choices.index(preprocess(doc["Correct Answer"])) + + out_doc = { + "choice1": choices[0], + "choice2": choices[1], + "choice3": choices[2], + "choice4": choices[3], + "answer": f"({chr(65 + correct_answer_index)})", + } + return out_doc + + return dataset.map(_process_doc) diff --git a/lm_eval/tasks/leaderboard/ifeval/ifeval.yaml b/lm_eval/tasks/leaderboard/ifeval/ifeval.yaml new file mode 100644 index 00000000..da394d9c --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/ifeval.yaml @@ -0,0 +1,32 @@ +task: leaderboard_ifeval +group: leaderboard_instruction_following +dataset_path: wis-k/instruction-following-eval +dataset_name: null +output_type: generate_until +test_split: train +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true +metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/lm_eval/tasks/leaderboard/ifeval/instructions.py b/lm_eval/tasks/leaderboard/ifeval/instructions.py new file mode 100644 index 00000000..5c352af3 --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/instructions.py @@ -0,0 +1,1612 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library of instructions.""" + +import collections +import json +import logging +import random +import re +import string +from typing import Dict, Optional, Sequence, Union + +import langdetect + +from lm_eval.tasks.ifeval import instructions_util + + +logger = logging.getLogger(__name__) + +_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] + +_LANGUAGES = instructions_util.LANGUAGE_CODES + +# The relational operation for comparison. +_COMPARISON_RELATION = ("less than", "at least") + +# The maximum number of sentences. +_MAX_NUM_SENTENCES = 20 + +# The number of placeholders. +_NUM_PLACEHOLDERS = 4 + +# The number of bullet lists. +_NUM_BULLETS = 5 + +# The options of constrained response. +_CONSTRAINED_RESPONSE_OPTIONS = ( + "My answer is yes.", + "My answer is no.", + "My answer is maybe.", +) + +# The options of starter keywords. 
+_STARTER_OPTIONS = ( + "I would say", + "My answer is", + "I believe", + "In my opinion", + "I think", + "I reckon", + "I feel", + "From my perspective", + "As I see it", + "According to me", + "As far as I'm concerned", + "To my understanding", + "In my view", + "My take on it is", + "As per my perception", +) + +# The options of ending keywords. +# TODO(jeffreyzhou) add more ending options +_ENDING_OPTIONS = ("Any other questions?", "Is there anything else I can help with?") + +# The number of highlighted sections. +_NUM_HIGHLIGHTED_SECTIONS = 4 + +# The section spliter. +_SECTION_SPLITER = ("Section", "SECTION") + +# The number of sections. +_NUM_SECTIONS = 5 + +# The number of paragraphs. +_NUM_PARAGRAPHS = 5 + +# The postscript marker. +_POSTSCRIPT_MARKER = ("P.S.", "P.P.S") + +# The number of keywords. +_NUM_KEYWORDS = 2 + +# The occurrences of a single keyword. +_KEYWORD_FREQUENCY = 3 + +# The occurrences of a single letter. +_LETTER_FREQUENCY = 10 + +# The occurrences of words with all capital letters. +_ALL_CAPITAL_WORD_FREQUENCY = 20 + +# The number of words in the response. +_NUM_WORDS_LOWER_LIMIT = 100 +_NUM_WORDS_UPPER_LIMIT = 500 + + +class Instruction: + """An instruction template.""" + + def __init__(self, instruction_id): + self.id = instruction_id + + def build_description(self, **kwargs): + raise NotImplementedError("`build_description` not implemented.") + + def get_instruction_args(self): + raise NotImplementedError("`get_instruction_args` not implemented.") + + def get_instruction_args_keys(self): + raise NotImplementedError("`get_instruction_args_keys` not implemented.") + + def check_following(self, value): + raise NotImplementedError("`check_following` not implemented.") + + +class ResponseLanguageChecker(Instruction): + """Check the language of the entire response.""" + + def build_description(self, *, language=None): + """Build the instruction description. + + Args: + language: A string representing the expected language of the response. The + language has to comply to the 97 types defined in + `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows + ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes); + for example, `en` for English, `zh` for Chinese, `fr` for French. + + Returns: + A string representing the instruction description. + """ + self._language = language + if self._language is None: + self._language = random.choice(list(_LANGUAGES.keys())) + # TODO(tianjianlu): opens the description generation to more choices. + self._description_pattern = ( + "Your ENTIRE response should be in {language} language, no other " + + "language is allowed." + ) + return self._description_pattern.format(language=_LANGUAGES[self._language]) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"language": self._language} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["language"] + + def check_following(self, value): + """Check if the language of the entire response follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the language of `value` follows instruction; otherwise False. + """ + assert isinstance(value, str) + + try: + return langdetect.detect(value) == self._language + except langdetect.LangDetectException as e: + # Count as instruction is followed. 
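+            # (langdetect raises LangDetectException when the text has no detectable features, e.g. empty strings.)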
+ logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class NumberOfSentences(Instruction): + """Check the number of sentences.""" + + def build_description(self, *, num_sentences=None, relation=None): + """Build the instruction description. + + Args: + num_sentences: An integer specifying the number of sentences as a + threshold. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of sentences < the threshold; + if 'at least', the actual number of sentences >= the threshold. + + Returns: + A string representing the instruction description. + """ + # The number of sentences as a threshold for comparison. + self._num_sentences_threshold = num_sentences + if self._num_sentences_threshold is None or self._num_sentences_threshold < 0: + self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "Your response should contain {relation} {num_sentences} sentences." + ) + return self._description_pattern.format( + relation=self._comparison_relation, + num_sentences=self._num_sentences_threshold, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences_threshold, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "relation"] + + def check_following(self, value): + """Check if the number of sentences follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the response follows the instruction. + + Raise: + ValueError if the string in `instruction_args` is not in + [`less_than`, `at_least`]. + """ + num_sentences = instructions_util.count_sentences(value) + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_sentences < self._num_sentences_threshold + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_sentences >= self._num_sentences_threshold + + +class PlaceholderChecker(Instruction): + """Check the placeholders in template writing.""" + + def build_description(self, *, num_placeholders=None): + """Build the instruction description. + + Args: + num_placeholders: An integer denoting the minimum number of + placeholders required in the response. + + Returns: + A string representing the instruction description. + """ + self._num_placeholders = num_placeholders + if self._num_placeholders is None or self._num_placeholders < 0: + self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS) + self._description_pattern = ( + "The response must contain at least {num_placeholders} placeholders " + + "represented by square brackets, such as [address]." 
+ ) + return self._description_pattern.format(num_placeholders=self._num_placeholders) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_placeholders": self._num_placeholders} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_placeholders"] + + def check_following(self, value): + """Check if the number of placeholders follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the actual number of placeholders in the response is greater than + or equal to `num_placeholders`; otherwise, False. + """ + placeholders = re.findall(r"\[.*?\]", value) + num_placeholders = len(placeholders) + return num_placeholders >= self._num_placeholders + + +class BulletListChecker(Instruction): + """Checks the bullet list in the prompt.""" + + def build_description(self, *, num_bullets=None): + """Build the instruction description. + + Args: + num_bullets: An integer specifying the exact number of bullet lists + that is required to appear in the response. + + Returns: + A string representing the instruction description. + """ + self._num_bullets = num_bullets + if self._num_bullets is None or self._num_bullets < 0: + self._num_bullets = random.randint(1, _NUM_BULLETS) + self._description_pattern = ( + "Your answer must contain exactly {num_bullets} bullet points. " + + "Use the markdown bullet points such as:\n" + + "* This is point 1. \n" + + "* This is point 2" + ) + return self._description_pattern.format(num_bullets=self._num_bullets) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_bullets": self._num_bullets} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_bullets"] + + def check_following(self, value): + r"""Check if the number of bullet lists meets the requirement. + + Args: + value: A string representing the response. The response is expected to + contain some bullet lists that start with `\*`. + + Returns: + True if the actual number of bullet lists in the response meets the + requirement. + """ + bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE) + bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE) + num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + return num_bullet_lists == self._num_bullets + + +class ConstrainedResponseChecker(Instruction): + """Checks the constrained response.""" + + def build_description(self): + """Build the instruction description.""" + # A sequence of string(s) representing the options of the expected response. + self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS + self._description_pattern = ( + "Answer with one of the following options: {response_options}" + ) + return self._description_pattern.format( + response_options=self._constrained_responses + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response matches the constrained options. + + Args: + value: A string representing the response. + + Returns: + True if the actual response contains one of the options in the constrained + responses; otherwise False. 
+ """ + value = value.strip() + for constrained_response in self._constrained_responses: + if constrained_response in value: + return True + return False + + +class ConstrainedStartChecker(Instruction): + """Checks the response start.""" + + def build_description(self, *, starter=None): + """Build the instruction description. + + Args: + starter: A string representing the keyward that the response should start + with. + + Returns: + A string representing the instruction description. + """ + self._starter = starter.strip() if isinstance(starter, str) else starter + if self._starter is None: + self._starter = random.choice(_STARTER_OPTIONS) + self._description_pattern = ( + "During the conversation, when it is your turn, " + + "please always start with {starter}" + ) + return self._description_pattern.format(starter=self._starter) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"starter": self._starter} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["starter"] + + def check_following(self, value): + """Checks if the response starts with the constrained keyword or phrase. + + Args: + value: A string representing the response. + + Returns: + True if the response starts with the given phrase or keyword that is + contained in `instruction_args`; otherwise, False. + """ + response_pattern = r"^\s*" + self._starter + r".*$" + response_with_constrained_start = re.search( + response_pattern, value, flags=re.MULTILINE + ) + return True if response_with_constrained_start else False + + +class HighlightSectionChecker(Instruction): + """Checks the highlighted section.""" + + def build_description(self, *, num_highlights=None): + """Build the instruction description. + + Args: + num_highlights: An integer specifying the minimum number of highlighted + sections. + + Returns: + A string representing the instruction description. + """ + self._num_highlights = num_highlights + if self._num_highlights is None or self._num_highlights < 0: + self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS) + + self._description_pattern = ( + "Highlight at least {num_highlights} sections in your answer with " + + "markdown, i.e. *highlighted section*." + ) + + return self._description_pattern.format(num_highlights=self._num_highlights) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_highlights": self._num_highlights} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_highlights"] + + def check_following(self, value): + """Checks if the number of highlighted sections meets the requirement. + + Args: + value: a string repesenting the response. The response is expected to + contain highlighted sections in the format of *highlighted*. + + Returns: + True if the actual number of highlighted sections in the format of + *highlighed sections* meets the minimum requirement; otherwise False. 
+ """ + num_highlights = 0 + highlights = re.findall(r"\*[^\n\*]*\*", value) + double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value) + for highlight in highlights: + if highlight.strip("*").strip(): + num_highlights += 1 + for highlight in double_highlights: + if highlight.removeprefix("**").removesuffix("**").strip(): + num_highlights += 1 + + return num_highlights >= self._num_highlights + + +class SectionChecker(Instruction): + """Checks the sections.""" + + def build_description(self, *, section_spliter=None, num_sections=None): + """Build the instruction description. + + Args: + section_spliter: A string represents the section spliter keyword that + marks a new section, i.e., `Section` or `SECTION`. + num_sections: An integer specifying the number of sections. + + Returns: + A string representing the instruction description. + """ + self._section_spliter = ( + section_spliter.strip() + if isinstance(section_spliter, str) + else section_spliter + ) + if self._section_spliter is None: + self._section_spliter = random.choice(_SECTION_SPLITER) + + self._num_sections = num_sections + if self._num_sections is None or self._num_sections < 0: + self._num_sections = random.randint(1, _NUM_SECTIONS) + + self._description_pattern = ( + "Your response must have {num_sections} sections. Mark the beginning " + + "of each section with {section_spliter} X, such as:\n" + + "{section_spliter} 1\n" + + "[content of section 1]\n" + + "{section_spliter} 2\n" + + "[content of section 2]" + ) + + return self._description_pattern.format( + num_sections=self._num_sections, section_spliter=self._section_spliter + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "section_spliter": self._section_spliter, + "num_sections": self._num_sections, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["section_spliter", "num_sections"] + + def check_following(self, value): + """Checks the response contains multiple sections. + + Args: + value: A string representing the response. The response is expected + to contain multiple sections (number of sections is greater than 1). + A new section starts with `Section 1`, where the number denotes the + section index. + + Returns: + True if the number of sections in the response is greater than or equal to + the minimum number of sections; otherwise, False. + """ + section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?" + sections = re.split(section_splitter_patten, value) + num_sections = len(sections) - 1 + return num_sections >= self._num_sections + + +class ParagraphChecker(Instruction): + """Checks the paragraphs.""" + + def build_description(self, *, num_paragraphs=None): + """Build the instruction description. + + Args: + num_paragraphs: An integer specifying the number of paragraphs. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. 
" + + "Paragraphs are separated with the markdown divider: ***" + ) + + return self._description_pattern.format(num_paragraphs=self._num_paragraphs) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_paragraphs": self._num_paragraphs} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs"] + + def check_following(self, value): + """Checks the response contains required number of paragraphs. + + Args: + value: A string representing the response. The response may contain + paragraphs that are separated by the markdown divider: `***`. + + Returns: + True if the actual number of paragraphs is the same as required; + otherwise, False. + """ + paragraphs = re.split(r"\s?\*\*\*\s?", value) + num_paragraphs = len(paragraphs) + + for index, paragraph in enumerate(paragraphs): + if not paragraph.strip(): + if index == 0 or index == len(paragraphs) - 1: + num_paragraphs -= 1 + else: + return False + + return num_paragraphs == self._num_paragraphs + + +class PostscriptChecker(Instruction): + """Checks the postscript.""" + + def build_description(self, *, postscript_marker=None): + """Build the instruction description. + + Args: + postscript_marker: A string containing the keyword that marks the start + of the postscript section. + + Returns: + A string representing the instruction description. + """ + self._postscript_marker = ( + postscript_marker.strip() + if isinstance(postscript_marker, str) + else postscript_marker + ) + if self._postscript_marker is None: + self._postscript_marker = random.choice(_POSTSCRIPT_MARKER) + + self._description_pattern = ( + "At the end of your response, please explicitly add a postscript " + + "starting with {postscript}" + ) + + return self._description_pattern.format(postscript=self._postscript_marker) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"postscript_marker": self._postscript_marker} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["postscript_marker"] + + def check_following(self, value): + """Checks if the response follows the postscript format. + + Args: + value: a string representing the response. The response is expected to + contain a postscript section. + + Returns: + True if the response contains a postscript section starting with + the keyword containing in the `instruction_args`; otherwise False. + """ + value = value.lower() + if self._postscript_marker == "P.P.S": + postscript_pattern = r"\s*p\.\s?p\.\s?s.*$" + elif self._postscript_marker == "P.S.": + postscript_pattern = r"\s*p\.\s?s\..*$" + else: + postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$" + postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE) + return True if postscript else False + + +class RephraseChecker(Instruction): + """Checks the repharse.""" + + def build_description(self, *, original_message): + """Build the instruction description. + + Args: + original_message: A string representing the original message. The + rephrased response should only change its words/sentences in between + its two asterisks, for example, *change me*. Both original and rephrased + messages should contain the changes in the form of *change me*. + + Returns: + A string representing the instruction description. 
+ """ + if not self.is_change(original_message): + raise ValueError( + f"Message {original_message} does not contain changes " + "in the form of *change me*." + ) + + self._reference_without_change = original_message + self._description = ( + "Rephrasing: Your rephrased response should only" + + "change the words/sentences in between two asterisks" + + "such as *change me*." + ) + return self._description + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"original_message": self._reference_without_change} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_message"] + + def check_following(self, value): + r"""Checks if the rephrasing follows the instruction. + + Args: + value: A string representing the response, which is expected to rephras + the string of `instruction_args`. + + Returns: + True if `value` and `instruction_args` only differ by the words/sentences + in between two asterisks such as *change me*; otherwise, False. + """ + + if not self.is_change(value): + raise ValueError( + f"value {value} does not contain " "changes in the form of *change me*." + ) + + response_without_changes = self.strip_changes(value) + reference_without_changes = self.strip_changes(self._reference_without_change) + + return response_without_changes == reference_without_changes + + def is_change(self, response): + """Check if there is change in the response in the form of *change me*.""" + return re.search(r"\*.*\*", response) + + def strip_changes(self, response): + """Strips off the changes.""" + return re.sub(r"\*.*\*", "", response) + + +class KeywordChecker(Instruction): + """Check the exisitence of certain keywords.""" + + def build_description(self, *, keywords=None): + """Build the instruction description. + + Args: + keywords: A sequence of strings representing the keywords that are + expected in the response. + + Returns: + A string representing the instruction description. + """ + + if not keywords: + self._keywords = instructions_util.generate_keywords( + num_keywords=_NUM_KEYWORDS + ) + else: + self._keywords = keywords + self._keywords = sorted(self._keywords) + + self._description_pattern = "Include keywords {keywords} in the response." + + return self._description_pattern.format(keywords=self._keywords) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"keywords": self._keywords} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keywords"] + + def check_following(self, value): + """Check if the response contain the expected keywords.""" + for keyword in self._keywords: + if not re.search(keyword, value, flags=re.IGNORECASE): + return False + return True + + +class KeywordFrequencyChecker(Instruction): + """Check the keyword frequency.""" + + def build_description(self, *, keyword=None, frequency=None, relation=None): + """Build the instruction description. + + Args: + keyword: A string representing a keyword that is expected in the response. + frequency: An integer specifying the number of times `keyword` is expected + to appear in the response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of occurrences < frequency; + if 'at least', the actual number of occurrences >= frequency. 
+ + Returns: + A string representing the instruction description. + """ + if not keyword: + self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] + else: + self._keyword = keyword.strip() + + self._frequency = frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _KEYWORD_FREQUENCY) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "In your response, the word {keyword} should appear {relation} " + + "{frequency} times." + ) + + return self._description_pattern.format( + keyword=self._keyword, + relation=self._comparison_relation, + frequency=self._frequency, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "keyword": self._keyword, + "frequency": self._frequency, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keyword", "frequency", "relation"] + + def check_following(self, value): + """Checks if the response contain the keyword with required frequency.""" + actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE)) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return actual_occurrences < self._frequency + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return actual_occurrences >= self._frequency + + +class NumberOfWords(Instruction): + """Checks the number of words.""" + + def build_description(self, *, num_words=None, relation=None): + """Build the instruction description. + + Args: + num_words: An integer specifying the number of words contained in the + response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of words < num_words; + if 'at least', the actual number of words >= num_words. + + Returns: + A string representing the instruction description. + """ + + self._num_words = num_words + if self._num_words is None or self._num_words < 0: + self._num_words = random.randint( + _NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT + ) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = "Answer with {relation} {num_words} words." 
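+        # e.g. relation="at least", num_words=300 renders as "Answer with at least 300 words."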
+ + return self._description_pattern.format( + relation=self._comparison_relation, num_words=self._num_words + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_words": self._num_words, "relation": self._comparison_relation} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_words", "relation"] + + def check_following(self, value): + """Checks if the response contains the expected number of words.""" + num_words = instructions_util.count_words(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_words < self._num_words + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_words >= self._num_words + + +class JsonFormat(Instruction): + """Check the Json format.""" + + def build_description(self): + self._description_pattern = ( + "Entire output should be wrapped in JSON format. You can use markdown" + " ticks such as ```." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + value = ( + value.strip() + .removeprefix("```json") + .removeprefix("```Json") + .removeprefix("```JSON") + .removeprefix("```") + .removesuffix("```") + .strip() + ) + try: + json.loads(value) + except ValueError: + return False + return True + + +class ParagraphFirstWordCheck(Instruction): + """Check the paragraph and the first word of the nth paragraph.""" + + def build_description( + self, num_paragraphs=None, nth_paragraph=None, first_word=None + ): + r"""Build the instruction description. + + Args: + num_paragraphs: An integer indicating the number of paragraphs expected + in the response. A paragraph is a subset of the string that is + expected to be separated by '\n\n'. + nth_paragraph: An integer indicating the paragraph number that we look at. + Note that n starts from 1. + first_word: A string that represent the first word of the bth paragraph. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._nth_paragraph = nth_paragraph + if ( + self._nth_paragraph is None + or self._nth_paragraph <= 0 + or self._nth_paragraph > self._num_paragraphs + ): + self._nth_paragraph = random.randint(1, self._num_paragraphs + 1) + + self._first_word = first_word + if self._first_word is None: + self._first_word = instructions_util.generate_keywords(num_keywords=1)[0] + self._first_word = self._first_word.lower() + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. " + + "Paragraphs and only paragraphs are separated with each other by two " + + "new lines as if it was '\\n\\n' in python. " + + "Paragraph {nth_paragraph} must start with word {first_word}." 
+ ) + + return self._description_pattern.format( + num_paragraphs=self._num_paragraphs, + nth_paragraph=self._nth_paragraph, + first_word=self._first_word, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_paragraphs": self._num_paragraphs, + "nth_paragraph": self._nth_paragraph, + "first_word": self._first_word, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs", "nth_paragraph", "first_word"] + + def check_following(self, value): + """Checks for required number of paragraphs and correct first word. + + Args: + value: a string representing the response. The response may contain + paragraphs that are separated by two new lines and the first word of + the nth paragraph will have to match a specified word. + + Returns: + True if the number of paragraphs is the same as required and the first + word of the specified paragraph is the same as required. Otherwise, false. + """ + + paragraphs = re.split(r"\n\n", value) + num_paragraphs = len(paragraphs) + + for paragraph in paragraphs: + if not paragraph.strip(): + num_paragraphs -= 1 + + # check that index doesn't go out of bounds + if self._nth_paragraph <= num_paragraphs: + paragraph = paragraphs[self._nth_paragraph - 1].strip() + if not paragraph: + return False + else: + return False + + first_word = "" + punctuation = {".", ",", "?", "!", "'", '"'} + + # get first word and remove punctuation + word = paragraph.split()[0].strip() + # TODO(jeffrey): make more complex? + word = word.lstrip("'") + word = word.lstrip('"') + + for letter in word: + if letter in punctuation: + break + first_word += letter.lower() + + return num_paragraphs == self._num_paragraphs and first_word == self._first_word + + +# TODO(jeffrey) add relation - at least/at most? +class KeySentenceChecker(Instruction): + """Check the existence of certain key sentences.""" + + def build_description(self, key_sentences=None, num_sentences=None): + """Build the instruction description. + + Args: + key_sentences: A sequences of strings representing the key sentences that + are expected in the response. + num_sentences: The number of key sentences that are expected to be seen in + the response. + + Returns: + A string representing the instruction description. + """ + + if not key_sentences: + # TODO(jeffrey) make a generate sentences function? 
wonderwords package + self._key_sentences = set(["For now, this is fine."]) + else: + self._key_sentences = key_sentences + + if not num_sentences: + self._num_sentences = random.randint(1, len(self._key_sentences)) + else: + self._num_sentences = num_sentences + + self._description_pattern = ( + "Include {num_sentences} of the following sentences {key_sentences}" + ) + + return self._description_pattern.format( + num_sentences=self._num_sentences, key_sentences=self._key_sentences + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences, + "key_sentences": list(self._key_sentences), + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "key_sentences"] + + def check_following(self, value): + """Checks if the response contains the expected key sentences.""" + count = 0 + sentences = instructions_util.split_into_sentences(value) + for sentence in self._key_sentences: + if sentence in sentences: + count += 1 + + return count == self._num_sentences + + +class ForbiddenWords(Instruction): + """Checks that specified words are not used in response.""" + + def build_description(self, forbidden_words=None): + """Build the instruction description. + + Args: + forbidden_words: A sequences of strings respresenting words that are not + allowed in the response. + + Returns: + A string representing the instruction description. + """ + + if not forbidden_words: + self._forbidden_words = instructions_util.generate_keywords( + num_keywords=_NUM_KEYWORDS + ) + else: + self._forbidden_words = list(set(forbidden_words)) + self._forbidden_words = sorted(self._forbidden_words) + self._description_pattern = ( + "Do not include keywords {forbidden_words} in the response." + ) + + return self._description_pattern.format(forbidden_words=self._forbidden_words) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"forbidden_words": self._forbidden_words} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["forbidden_words"] + + def check_following(self, value): + """Check if the response does not contain the expected keywords.""" + for word in self._forbidden_words: + if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE): + return False + return True + + +class RephraseParagraph(Instruction): + """Checks that the paragraph is rephrased.""" + + def build_description(self, *, original_paragraph, low, high): + """Builds the instruction description. + + Args: + original_paragraph: A string presenting the original paragraph. The + rephrases response should have betweeb low-high words in common. + low: An integer presenting the lower bound of similar words. + high: An integer representing the upper bound of similar words. + + Returns: + A string representing the instruction description. + """ + # TODO(jeffrey) make more encompassing + self._original_paragraph = original_paragraph + self._low = low + self._high = high + + self._description = ( + "Rephrase the following paragraph: " + + "{original_paragraph}\nYour response should have " + + "between {low} and {high} of the same words. " + + "Words are the same if and only if all of the " + + "letters, ignoring cases, are the same. For " + + "example, 'run' is the same as 'Run' but different " + + "to 'ran'." 
+ ) + + return self._description.format( + original_paragraph=original_paragraph, low=self._low, high=self._high + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "original_paragraph": self._original_paragraph, + "low": self._low, + "high": self._high, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_paragraph", "low", "high"] + + def check_following(self, value): + val_words = re.findall(r"\w+", value.lower()) + original_words = re.findall(r"\w+", self._original_paragraph.lower()) + similar_words = 0 + + dict_val = collections.Counter(val_words) + dict_original = collections.Counter(original_words) + + for word in dict_original: + similar_words += min(dict_original[word], dict_val[word]) + + return similar_words >= self._low and similar_words <= self._high + + +class TwoResponsesChecker(Instruction): + """Check that two responses were given.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Give two different responses. Responses and only responses should" + " be separated by 6 asterisk symbols: ******." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response has two different answers. + + Args: + value: A string representing the response. + + Returns: + True if two responses are detected and false otherwise. + """ + valid_responses = list() + responses = value.split("******") + for index, response in enumerate(responses): + if not response.strip(): + if index != 0 and index != len(responses) - 1: + return False + else: + valid_responses.append(response) + return ( + len(valid_responses) == 2 + and valid_responses[0].strip() != valid_responses[1].strip() + ) + + +class RepeatPromptThenAnswer(Instruction): + """Checks that Prompt is first repeated then answered.""" + + def build_description(self, *, prompt_to_repeat=None): + """Build the instruction description. + + Args: + prompt_to_repeat: The prompt that is meant to be repeated. + + Returns: + A string representing the instruction description. + """ + if not prompt_to_repeat: + raise ValueError("prompt_to_repeat must be set.") + else: + self._prompt_to_repeat = prompt_to_repeat + self._description_pattern = ( + "First repeat the request word for word without change," + " then give your answer (1. do not say any words or characters" + " before repeating the request; 2. the request you need to repeat" + " does not include this sentence)" + ) + return self._description_pattern + + def get_instruction_args(self): + return {"prompt_to_repeat": self._prompt_to_repeat} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["prompt_to_repeat"] + + def check_following(self, value): + if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()): + return True + return False + + +class EndChecker(Instruction): + """Checks that the prompt ends with a given phrase.""" + + def build_description(self, *, end_phrase=None): + """Build the instruction description. + + Args: + end_phrase: A string representing the phrase the response should end with. + + Returns: + A string representing the instruction description. 
+        """
+        self._end_phrase = (
+            end_phrase.strip() if isinstance(end_phrase, str) else end_phrase
+        )
+        if self._end_phrase is None:
+            self._end_phrase = random.choice(_ENDING_OPTIONS)
+        self._description_pattern = (
+            "Finish your response with this exact phrase {ender}. "
+            "No other words should follow this phrase."
+        )
+        return self._description_pattern.format(ender=self._end_phrase)
+
+    def get_instruction_args(self):
+        return {"end_phrase": self._end_phrase}
+
+    def get_instruction_args_keys(self):
+        """Returns the args keys of `build_description`."""
+        return ["end_phrase"]
+
+    def check_following(self, value):
+        """Checks if the response ends with the expected phrase."""
+        value = value.strip().strip('"').lower()
+        self._end_phrase = self._end_phrase.strip().lower()
+        return value.endswith(self._end_phrase)
+
+
+class TitleChecker(Instruction):
+    """Checks the response for a title."""
+
+    def build_description(self):
+        """Build the instruction description."""
+        self._description_pattern = (
+            "Your answer must contain a title, wrapped in double angular brackets,"
+            " such as <<poem of joy>>."
+        )
+        return self._description_pattern
+
+    def get_instruction_args(self):
+        return None
+
+    def get_instruction_args_keys(self):
+        """Returns the args keys of `build_description`."""
+        return []
+
+    def check_following(self, value):
+        """Checks if the response contains a title."""
+        pattern = r"<<[^\n]+>>"
+        re_pattern = re.compile(pattern)
+        titles = re.findall(re_pattern, value)
+
+        for title in titles:
+            if title.lstrip("<").rstrip(">").strip():
+                return True
+        return False
+
+
+class LetterFrequencyChecker(Instruction):
+    """Checks letter frequency."""
+
+    def build_description(self, *, letter=None, let_frequency=None, let_relation=None):
+        """Build the instruction description.
+
+        Args:
+          letter: A string representing a letter that is expected in the response.
+          let_frequency: An integer specifying the number of times `letter` is
+            expected to appear in the response.
+          let_relation: A string in (`less than`, `at least`), defining the
+            relational operator for comparison. Two relational comparisons are
+            supported for now; if 'less than', the actual number of
+            occurrences < frequency; if 'at least', the actual number of
+            occurrences >= frequency.
+
+        Returns:
+          A string representing the instruction description.
+        """
+        if (
+            not letter
+            or len(letter) > 1
+            or ord(letter.lower()) < 97
+            or ord(letter.lower()) > 122
+        ):
+            self._letter = random.choice(list(string.ascii_letters))
+        else:
+            self._letter = letter.strip()
+        self._letter = self._letter.lower()
+
+        self._frequency = let_frequency
+        if self._frequency is None or self._frequency < 0:
+            self._frequency = random.randint(1, _LETTER_FREQUENCY)
+
+        if let_relation is None:
+            self._comparison_relation = random.choice(_COMPARISON_RELATION)
+        elif let_relation not in _COMPARISON_RELATION:
+            raise ValueError(
+                "The supported relation for comparison must be in "
+                f"{_COMPARISON_RELATION}, but {let_relation} is given."
+            )
+        else:
+            self._comparison_relation = let_relation
+
+        self._description_pattern = (
+            "In your response, the letter {letter} should appear {let_relation}"
+            " {let_frequency} times."
+ ) + + return self._description_pattern.format( + letter=self._letter, + let_frequency=self._frequency, + let_relation=self._comparison_relation, + ) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "letter": self._letter, + "let_frequency": self._frequency, + "let_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["letter", "let_frequency", "let_relation"] + + def check_following(self, value): + """Checks that the response contains the letter at the right frequency.""" + value = value.lower() + letters = collections.Counter(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return letters[self._letter] < self._frequency + else: + return letters[self._letter] >= self._frequency + + +class CapitalLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all capital letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your entire response should be in English, and in all capital letters." + ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all capital letters.""" + assert isinstance(value, str) + + try: + return value.isupper() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class LowercaseLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all lowercase letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your entire response should be in English, and in all lowercase" + " letters. No capital letters are allowed." + ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all lowercase letters.""" + assert isinstance(value, str) + + try: + return value.islower() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class CommaChecker(Instruction): + """Checks the response for no commas.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "In your entire response, refrain from the use of any commas." 
+ ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response does not contain commas.""" + return not re.search(r"\,", value) + + +class CapitalWordFrequencyChecker(Instruction): + """Checks frequency of words with all capital letters.""" + + def build_description( + self, + capital_frequency=None, + capital_relation=None, + ): + """Build the instruction description. + + Args: + capital_frequency: An integer that represents the number of words that + should be in all capital letters. + capital_relation: A string that is 'at least' or 'at most' that refers to + the frequency. + + Returns: + A string representing the instruction description. + """ + self._frequency = capital_frequency + if self._frequency is None: + self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY) + + self._comparison_relation = capital_relation + if capital_relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif capital_relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {capital_relation} is given." + ) + + self._description_pattern = ( + "In your response, words with all capital letters should appear" + " {relation} {frequency} times." + ) + + return self._description_pattern.format( + frequency=self._frequency, relation=self._comparison_relation + ) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "capital_frequency": self._frequency, + "capital_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["capital_frequency", "capital_relation"] + + def check_following(self, value): + """Checks the frequency of words with all capital letters.""" + # Hyphenated words will count as one word + words = instructions_util.nltk.word_tokenize(value) + capital_words = [word for word in words if word.isupper()] + + capital_words = len(capital_words) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return capital_words < self._frequency + else: + return capital_words >= self._frequency + + +class QuotationChecker(Instruction): + """Checks response is wrapped with double quotation marks.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Wrap your entire response with double quotation marks." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response is wrapped with double quotation marks.""" + value = value.strip() + return len(value) > 1 and value[0] == '"' and value[-1] == '"' diff --git a/lm_eval/tasks/leaderboard/ifeval/instructions_registry.py b/lm_eval/tasks/leaderboard/ifeval/instructions_registry.py new file mode 100644 index 00000000..00d9a1de --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/instructions_registry.py @@ -0,0 +1,168 @@ +# Copyright 2023 The Google Research Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Registry of all instructions.""" + +from lm_eval.tasks.ifeval import instructions + + +_KEYWORD = "keywords:" + +_LANGUAGE = "language:" + +_LENGTH = "length_constraints:" + +_CONTENT = "detectable_content:" + +_FORMAT = "detectable_format:" + +_MULTITURN = "multi-turn:" + +_COMBINATION = "combination:" + +_STARTEND = "startend:" + +_CHANGE_CASES = "change_case:" + +_PUNCTUATION = "punctuation:" + +INSTRUCTION_DICT = { + _KEYWORD + "existence": instructions.KeywordChecker, + _KEYWORD + "frequency": instructions.KeywordFrequencyChecker, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": instructions.ForbiddenWords, + _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker, + _LANGUAGE + "response_language": instructions.ResponseLanguageChecker, + _LENGTH + "number_sentences": instructions.NumberOfSentences, + _LENGTH + "number_paragraphs": instructions.ParagraphChecker, + _LENGTH + "number_words": instructions.NumberOfWords, + _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck, + _CONTENT + "number_placeholders": instructions.PlaceholderChecker, + _CONTENT + "postscript": instructions.PostscriptChecker, + _FORMAT + "number_bullet_lists": instructions.BulletListChecker, + # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker, + _FORMAT + "number_highlighted_sections": (instructions.HighlightSectionChecker), + _FORMAT + "multiple_sections": instructions.SectionChecker, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": instructions.JsonFormat, + _FORMAT + "title": instructions.TitleChecker, + # TODO(tianjianlu): Re-enable with specific prompts. 
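+    # NOTE: keys are the instruction ids (e.g. "keywords:existence") stored in the
+    # dataset's instruction_id_list field; utils.py resolves each id to its checker
+    # class through this dict.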
+ # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": instructions.TwoResponsesChecker, + _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer, + _STARTEND + "end_checker": instructions.EndChecker, + _CHANGE_CASES + "capital_word_frequency": instructions.CapitalWordFrequencyChecker, + _CHANGE_CASES + "english_capital": instructions.CapitalLettersEnglishChecker, + _CHANGE_CASES + "english_lowercase": instructions.LowercaseLettersEnglishChecker, + _PUNCTUATION + "no_comma": instructions.CommaChecker, + _STARTEND + "quotation": instructions.QuotationChecker, +} + +INSTRUCTION_CONFLICTS = { + _KEYWORD + "existence": {_KEYWORD + "existence"}, + _KEYWORD + "frequency": {_KEYWORD + "frequency"}, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, + _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, + _LANGUAGE + "response_language": { + _LANGUAGE + "response_language", + _FORMAT + "multiple_sections", + _KEYWORD + "existence", + _KEYWORD + "frequency", + _KEYWORD + "forbidden_words", + _STARTEND + "end_checker", + _CHANGE_CASES + "english_capital", + _CHANGE_CASES + "english_lowercase", + }, + _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, + _LENGTH + "number_paragraphs": { + _LENGTH + "number_paragraphs", + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_sentences", + _LENGTH + "nth_paragraph_first_word", + }, + _LENGTH + "number_words": {_LENGTH + "number_words"}, + _LENGTH + "nth_paragraph_first_word": { + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_paragraphs", + }, + _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"}, + _CONTENT + "postscript": {_CONTENT + "postscript"}, + _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"}, + # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), + _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, + _FORMAT + "multiple_sections": { + _FORMAT + "multiple_sections", + _LANGUAGE + "response_language", + _FORMAT + "number_highlighted_sections", + }, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} + ), + _FORMAT + "title": {_FORMAT + "title"}, + # TODO(tianjianlu): Re-enable with specific prompts. 
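+    # NOTE: each value is the set of instruction ids that should not be combined
+    # with the key in a single prompt; conflict_make() below makes these relations
+    # symmetric and self-inclusive.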
+ # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( + { + _KEYWORD + "forbidden_words", + _KEYWORD + "existence", + _LANGUAGE + "response_language", + _FORMAT + "title", + _PUNCTUATION + "no_comma", + } + ), + _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} + ), + _STARTEND + "end_checker": {_STARTEND + "end_checker"}, + _CHANGE_CASES + "capital_word_frequency": { + _CHANGE_CASES + "capital_word_frequency", + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, + _CHANGE_CASES + "english_lowercase": { + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"}, + _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"}, +} + + +def conflict_make(conflicts): + """Makes sure if A conflicts with B, B will conflict with A. + + Args: + conflicts: Dictionary of potential conflicts where key is instruction id + and value is set of instruction ids that it conflicts with. + + Returns: + Revised version of the dictionary. All instructions conflict with + themselves. If A conflicts with B, B will conflict with A. + """ + for key in conflicts: + for k in conflicts[key]: + conflicts[k].add(key) + conflicts[key].add(key) + return conflicts diff --git a/lm_eval/tasks/leaderboard/ifeval/instructions_util.py b/lm_eval/tasks/leaderboard/ifeval/instructions_util.py new file mode 100644 index 00000000..ccb531f9 --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/instructions_util.py @@ -0,0 +1,1682 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utility library of instructions.""" + +import functools +import random +import re + +import immutabledict +import nltk + + +def download_nltk_resources(): + """Download 'punkt' if not already installed""" + try: + nltk.data.find("tokenizers/punkt") + except LookupError: + nltk.download("punkt") + + +download_nltk_resources() + +WORD_LIST = [ + "western", + "sentence", + "signal", + "dump", + "spot", + "opposite", + "bottom", + "potato", + "administration", + "working", + "welcome", + "morning", + "good", + "agency", + "primary", + "wish", + "responsibility", + "press", + "problem", + "president", + "steal", + "brush", + "read", + "type", + "beat", + "trainer", + "growth", + "lock", + "bone", + "case", + "equal", + "comfortable", + "region", + "replacement", + "performance", + "mate", + "walk", + "medicine", + "film", + "thing", + "rock", + "tap", + "total", + "competition", + "ease", + "south", + "establishment", + "gather", + "parking", + "world", + "plenty", + "breath", + "claim", + "alcohol", + "trade", + "dear", + "highlight", + "street", + "matter", + "decision", + "mess", + "agreement", + "studio", + "coach", + "assist", + "brain", + "wing", + "style", + "private", + "top", + "brown", + "leg", + "buy", + "procedure", + "method", + "speed", + "high", + "company", + "valuable", + "pie", + "analyst", + "session", + "pattern", + "district", + "pleasure", + "dinner", + "swimming", + "joke", + "order", + "plate", + "department", + "motor", + "cell", + "spend", + "cabinet", + "difference", + "power", + "examination", + "engine", + "horse", + "dimension", + "pay", + "toe", + "curve", + "literature", + "bother", + "fire", + "possibility", + "debate", + "activity", + "passage", + "hello", + "cycle", + "background", + "quiet", + "author", + "effect", + "actor", + "page", + "bicycle", + "error", + "throat", + "attack", + "character", + "phone", + "tea", + "increase", + "outcome", + "file", + "specific", + "inspector", + "internal", + "potential", + "staff", + "building", + "employer", + "shoe", + "hand", + "direction", + "garden", + "purchase", + "interview", + "study", + "recognition", + "member", + "spiritual", + "oven", + "sandwich", + "weird", + "passenger", + "particular", + "response", + "reaction", + "size", + "variation", + "a", + "cancel", + "candy", + "exit", + "guest", + "condition", + "fly", + "price", + "weakness", + "convert", + "hotel", + "great", + "mouth", + "mind", + "song", + "sugar", + "suspect", + "telephone", + "ear", + "roof", + "paint", + "refrigerator", + "organization", + "jury", + "reward", + "engineering", + "day", + "possession", + "crew", + "bar", + "road", + "description", + "celebration", + "score", + "mark", + "letter", + "shower", + "suggestion", + "sir", + "luck", + "national", + "progress", + "hall", + "stroke", + "theory", + "offer", + "story", + "tax", + "definition", + "history", + "ride", + "medium", + "opening", + "glass", + "elevator", + "stomach", + "question", + "ability", + "leading", + "village", + "computer", + "city", + "grand", + "confidence", + "candle", + "priest", + "recommendation", + "point", + "necessary", + "body", + "desk", + "secret", + "horror", + "noise", + "culture", + "warning", + "water", + "round", + "diet", + "flower", + "bus", + "tough", + "permission", + "week", + "prompt", + "connection", + "abuse", + "height", + "save", + "corner", + "border", + "stress", + "drive", + "stop", + "rip", + "meal", + "listen", + "confusion", + "girlfriend", + "living", + "relation", + "significance", + "plan", + "creative", + "atmosphere", + 
"blame", + "invite", + "housing", + "paper", + "drink", + "roll", + "silver", + "drunk", + "age", + "damage", + "smoke", + "environment", + "pack", + "savings", + "influence", + "tourist", + "rain", + "post", + "sign", + "grandmother", + "run", + "profit", + "push", + "clerk", + "final", + "wine", + "swim", + "pause", + "stuff", + "singer", + "funeral", + "average", + "source", + "scene", + "tradition", + "personal", + "snow", + "nobody", + "distance", + "sort", + "sensitive", + "animal", + "major", + "negotiation", + "click", + "mood", + "period", + "arrival", + "expression", + "holiday", + "repeat", + "dust", + "closet", + "gold", + "bad", + "sail", + "combination", + "clothes", + "emphasis", + "duty", + "black", + "step", + "school", + "jump", + "document", + "professional", + "lip", + "chemical", + "front", + "wake", + "while", + "inside", + "watch", + "row", + "subject", + "penalty", + "balance", + "possible", + "adult", + "aside", + "sample", + "appeal", + "wedding", + "depth", + "king", + "award", + "wife", + "blow", + "site", + "camp", + "music", + "safe", + "gift", + "fault", + "guess", + "act", + "shame", + "drama", + "capital", + "exam", + "stupid", + "record", + "sound", + "swing", + "novel", + "minimum", + "ratio", + "machine", + "shape", + "lead", + "operation", + "salary", + "cloud", + "affair", + "hit", + "chapter", + "stage", + "quantity", + "access", + "army", + "chain", + "traffic", + "kick", + "analysis", + "airport", + "time", + "vacation", + "philosophy", + "ball", + "chest", + "thanks", + "place", + "mountain", + "advertising", + "red", + "past", + "rent", + "return", + "tour", + "house", + "construction", + "net", + "native", + "war", + "figure", + "fee", + "spray", + "user", + "dirt", + "shot", + "task", + "stick", + "friend", + "software", + "promotion", + "interaction", + "surround", + "block", + "purpose", + "practice", + "conflict", + "routine", + "requirement", + "bonus", + "hole", + "state", + "junior", + "sweet", + "catch", + "tear", + "fold", + "wall", + "editor", + "life", + "position", + "pound", + "respect", + "bathroom", + "coat", + "script", + "job", + "teach", + "birth", + "view", + "resolve", + "theme", + "employee", + "doubt", + "market", + "education", + "serve", + "recover", + "tone", + "harm", + "miss", + "union", + "understanding", + "cow", + "river", + "association", + "concept", + "training", + "recipe", + "relationship", + "reserve", + "depression", + "proof", + "hair", + "revenue", + "independent", + "lift", + "assignment", + "temporary", + "amount", + "loss", + "edge", + "track", + "check", + "rope", + "estimate", + "pollution", + "stable", + "message", + "delivery", + "perspective", + "mirror", + "assistant", + "representative", + "witness", + "nature", + "judge", + "fruit", + "tip", + "devil", + "town", + "emergency", + "upper", + "drop", + "stay", + "human", + "neck", + "speaker", + "network", + "sing", + "resist", + "league", + "trip", + "signature", + "lawyer", + "importance", + "gas", + "choice", + "engineer", + "success", + "part", + "external", + "worker", + "simple", + "quarter", + "student", + "heart", + "pass", + "spite", + "shift", + "rough", + "lady", + "grass", + "community", + "garage", + "youth", + "standard", + "skirt", + "promise", + "blind", + "television", + "disease", + "commission", + "positive", + "energy", + "calm", + "presence", + "tune", + "basis", + "preference", + "head", + "common", + "cut", + "somewhere", + "presentation", + "current", + "thought", + "revolution", + "effort", + "master", + "implement", + 
"republic", + "floor", + "principle", + "stranger", + "shoulder", + "grade", + "button", + "tennis", + "police", + "collection", + "account", + "register", + "glove", + "divide", + "professor", + "chair", + "priority", + "combine", + "peace", + "extension", + "maybe", + "evening", + "frame", + "sister", + "wave", + "code", + "application", + "mouse", + "match", + "counter", + "bottle", + "half", + "cheek", + "resolution", + "back", + "knowledge", + "make", + "discussion", + "screw", + "length", + "accident", + "battle", + "dress", + "knee", + "log", + "package", + "it", + "turn", + "hearing", + "newspaper", + "layer", + "wealth", + "profile", + "imagination", + "answer", + "weekend", + "teacher", + "appearance", + "meet", + "bike", + "rise", + "belt", + "crash", + "bowl", + "equivalent", + "support", + "image", + "poem", + "risk", + "excitement", + "remote", + "secretary", + "public", + "produce", + "plane", + "display", + "money", + "sand", + "situation", + "punch", + "customer", + "title", + "shake", + "mortgage", + "option", + "number", + "pop", + "window", + "extent", + "nothing", + "experience", + "opinion", + "departure", + "dance", + "indication", + "boy", + "material", + "band", + "leader", + "sun", + "beautiful", + "muscle", + "farmer", + "variety", + "fat", + "handle", + "director", + "opportunity", + "calendar", + "outside", + "pace", + "bath", + "fish", + "consequence", + "put", + "owner", + "go", + "doctor", + "information", + "share", + "hurt", + "protection", + "career", + "finance", + "force", + "golf", + "garbage", + "aspect", + "kid", + "food", + "boot", + "milk", + "respond", + "objective", + "reality", + "raw", + "ring", + "mall", + "one", + "impact", + "area", + "news", + "international", + "series", + "impress", + "mother", + "shelter", + "strike", + "loan", + "month", + "seat", + "anything", + "entertainment", + "familiar", + "clue", + "year", + "glad", + "supermarket", + "natural", + "god", + "cost", + "conversation", + "tie", + "ruin", + "comfort", + "earth", + "storm", + "percentage", + "assistance", + "budget", + "strength", + "beginning", + "sleep", + "other", + "young", + "unit", + "fill", + "store", + "desire", + "hide", + "value", + "cup", + "maintenance", + "nurse", + "function", + "tower", + "role", + "class", + "camera", + "database", + "panic", + "nation", + "basket", + "ice", + "art", + "spirit", + "chart", + "exchange", + "feedback", + "statement", + "reputation", + "search", + "hunt", + "exercise", + "nasty", + "notice", + "male", + "yard", + "annual", + "collar", + "date", + "platform", + "plant", + "fortune", + "passion", + "friendship", + "spread", + "cancer", + "ticket", + "attitude", + "island", + "active", + "object", + "service", + "buyer", + "bite", + "card", + "face", + "steak", + "proposal", + "patient", + "heat", + "rule", + "resident", + "broad", + "politics", + "west", + "knife", + "expert", + "girl", + "design", + "salt", + "baseball", + "grab", + "inspection", + "cousin", + "couple", + "magazine", + "cook", + "dependent", + "security", + "chicken", + "version", + "currency", + "ladder", + "scheme", + "kitchen", + "employment", + "local", + "attention", + "manager", + "fact", + "cover", + "sad", + "guard", + "relative", + "county", + "rate", + "lunch", + "program", + "initiative", + "gear", + "bridge", + "breast", + "talk", + "dish", + "guarantee", + "beer", + "vehicle", + "reception", + "woman", + "substance", + "copy", + "lecture", + "advantage", + "park", + "cold", + "death", + "mix", + "hold", + "scale", + "tomorrow", + "blood", + 
"request", + "green", + "cookie", + "church", + "strip", + "forever", + "beyond", + "debt", + "tackle", + "wash", + "following", + "feel", + "maximum", + "sector", + "sea", + "property", + "economics", + "menu", + "bench", + "try", + "language", + "start", + "call", + "solid", + "address", + "income", + "foot", + "senior", + "honey", + "few", + "mixture", + "cash", + "grocery", + "link", + "map", + "form", + "factor", + "pot", + "model", + "writer", + "farm", + "winter", + "skill", + "anywhere", + "birthday", + "policy", + "release", + "husband", + "lab", + "hurry", + "mail", + "equipment", + "sink", + "pair", + "driver", + "consideration", + "leather", + "skin", + "blue", + "boat", + "sale", + "brick", + "two", + "feed", + "square", + "dot", + "rush", + "dream", + "location", + "afternoon", + "manufacturer", + "control", + "occasion", + "trouble", + "introduction", + "advice", + "bet", + "eat", + "kill", + "category", + "manner", + "office", + "estate", + "pride", + "awareness", + "slip", + "crack", + "client", + "nail", + "shoot", + "membership", + "soft", + "anybody", + "web", + "official", + "individual", + "pizza", + "interest", + "bag", + "spell", + "profession", + "queen", + "deal", + "resource", + "ship", + "guy", + "chocolate", + "joint", + "formal", + "upstairs", + "car", + "resort", + "abroad", + "dealer", + "associate", + "finger", + "surgery", + "comment", + "team", + "detail", + "crazy", + "path", + "tale", + "initial", + "arm", + "radio", + "demand", + "single", + "draw", + "yellow", + "contest", + "piece", + "quote", + "pull", + "commercial", + "shirt", + "contribution", + "cream", + "channel", + "suit", + "discipline", + "instruction", + "concert", + "speech", + "low", + "effective", + "hang", + "scratch", + "industry", + "breakfast", + "lay", + "join", + "metal", + "bedroom", + "minute", + "product", + "rest", + "temperature", + "many", + "give", + "argument", + "print", + "purple", + "laugh", + "health", + "credit", + "investment", + "sell", + "setting", + "lesson", + "egg", + "middle", + "marriage", + "level", + "evidence", + "phrase", + "love", + "self", + "benefit", + "guidance", + "affect", + "you", + "dad", + "anxiety", + "special", + "boyfriend", + "test", + "blank", + "payment", + "soup", + "obligation", + "reply", + "smile", + "deep", + "complaint", + "addition", + "review", + "box", + "towel", + "minor", + "fun", + "soil", + "issue", + "cigarette", + "internet", + "gain", + "tell", + "entry", + "spare", + "incident", + "family", + "refuse", + "branch", + "can", + "pen", + "grandfather", + "constant", + "tank", + "uncle", + "climate", + "ground", + "volume", + "communication", + "kind", + "poet", + "child", + "screen", + "mine", + "quit", + "gene", + "lack", + "charity", + "memory", + "tooth", + "fear", + "mention", + "marketing", + "reveal", + "reason", + "court", + "season", + "freedom", + "land", + "sport", + "audience", + "classroom", + "law", + "hook", + "win", + "carry", + "eye", + "smell", + "distribution", + "research", + "country", + "dare", + "hope", + "whereas", + "stretch", + "library", + "if", + "delay", + "college", + "plastic", + "book", + "present", + "use", + "worry", + "champion", + "goal", + "economy", + "march", + "election", + "reflection", + "midnight", + "slide", + "inflation", + "action", + "challenge", + "guitar", + "coast", + "apple", + "campaign", + "field", + "jacket", + "sense", + "way", + "visual", + "remove", + "weather", + "trash", + "cable", + "regret", + "buddy", + "beach", + "historian", + "courage", + "sympathy", + "truck", + 
"tension", + "permit", + "nose", + "bed", + "son", + "person", + "base", + "meat", + "usual", + "air", + "meeting", + "worth", + "game", + "independence", + "physical", + "brief", + "play", + "raise", + "board", + "she", + "key", + "writing", + "pick", + "command", + "party", + "yesterday", + "spring", + "candidate", + "physics", + "university", + "concern", + "development", + "change", + "string", + "target", + "instance", + "room", + "bitter", + "bird", + "football", + "normal", + "split", + "impression", + "wood", + "long", + "meaning", + "stock", + "cap", + "leadership", + "media", + "ambition", + "fishing", + "essay", + "salad", + "repair", + "today", + "designer", + "night", + "bank", + "drawing", + "inevitable", + "phase", + "vast", + "chip", + "anger", + "switch", + "cry", + "twist", + "personality", + "attempt", + "storage", + "being", + "preparation", + "bat", + "selection", + "white", + "technology", + "contract", + "side", + "section", + "station", + "till", + "structure", + "tongue", + "taste", + "truth", + "difficulty", + "group", + "limit", + "main", + "move", + "feeling", + "light", + "example", + "mission", + "might", + "wait", + "wheel", + "shop", + "host", + "classic", + "alternative", + "cause", + "agent", + "consist", + "table", + "airline", + "text", + "pool", + "craft", + "range", + "fuel", + "tool", + "partner", + "load", + "entrance", + "deposit", + "hate", + "article", + "video", + "summer", + "feature", + "extreme", + "mobile", + "hospital", + "flight", + "fall", + "pension", + "piano", + "fail", + "result", + "rub", + "gap", + "system", + "report", + "suck", + "ordinary", + "wind", + "nerve", + "ask", + "shine", + "note", + "line", + "mom", + "perception", + "brother", + "reference", + "bend", + "charge", + "treat", + "trick", + "term", + "homework", + "bake", + "bid", + "status", + "project", + "strategy", + "orange", + "let", + "enthusiasm", + "parent", + "concentrate", + "device", + "travel", + "poetry", + "business", + "society", + "kiss", + "end", + "vegetable", + "employ", + "schedule", + "hour", + "brave", + "focus", + "process", + "movie", + "illegal", + "general", + "coffee", + "ad", + "highway", + "chemistry", + "psychology", + "hire", + "bell", + "conference", + "relief", + "show", + "neat", + "funny", + "weight", + "quality", + "club", + "daughter", + "zone", + "touch", + "tonight", + "shock", + "burn", + "excuse", + "name", + "survey", + "landscape", + "advance", + "satisfaction", + "bread", + "disaster", + "item", + "hat", + "prior", + "shopping", + "visit", + "east", + "photo", + "home", + "idea", + "father", + "comparison", + "cat", + "pipe", + "winner", + "count", + "lake", + "fight", + "prize", + "foundation", + "dog", + "keep", + "ideal", + "fan", + "struggle", + "peak", + "safety", + "solution", + "hell", + "conclusion", + "population", + "strain", + "alarm", + "measurement", + "second", + "train", + "race", + "due", + "insurance", + "boss", + "tree", + "monitor", + "sick", + "course", + "drag", + "appointment", + "slice", + "still", + "care", + "patience", + "rich", + "escape", + "emotion", + "royal", + "female", + "childhood", + "government", + "picture", + "will", + "sock", + "big", + "gate", + "oil", + "cross", + "pin", + "improvement", + "championship", + "silly", + "help", + "sky", + "pitch", + "man", + "diamond", + "most", + "transition", + "work", + "science", + "committee", + "moment", + "fix", + "teaching", + "dig", + "specialist", + "complex", + "guide", + "people", + "dead", + "voice", + "original", + "break", + "topic", + "data", 
+    "degree",
+    "reading",
+    "recording",
+    "bunch",
+    "reach",
+    "judgment",
+    "lie",
+    "regular",
+    "set",
+    "painting",
+    "mode",
+    "list",
+    "player",
+    "bear",
+    "north",
+    "wonder",
+    "carpet",
+    "heavy",
+    "officer",
+    "negative",
+    "clock",
+    "unique",
+    "baby",
+    "pain",
+    "assumption",
+    "disk",
+    "iron",
+    "bill",
+    "drawer",
+    "look",
+    "double",
+    "mistake",
+    "finish",
+    "future",
+    "brilliant",
+    "contact",
+    "math",
+    "rice",
+    "leave",
+    "restaurant",
+    "discount",
+    "sex",
+    "virus",
+    "bit",
+    "trust",
+    "event",
+    "wear",
+    "juice",
+    "failure",
+    "bug",
+    "context",
+    "mud",
+    "whole",
+    "wrap",
+    "intention",
+    "draft",
+    "pressure",
+    "cake",
+    "dark",
+    "explanation",
+    "space",
+    "angle",
+    "word",
+    "efficiency",
+    "management",
+    "habit",
+    "star",
+    "chance",
+    "finding",
+    "transportation",
+    "stand",
+    "criticism",
+    "flow",
+    "door",
+    "injury",
+    "insect",
+    "surprise",
+    "apartment",
+] # pylint: disable=line-too-long
+
+# ISO 639-1 codes to language names.
+LANGUAGE_CODES = immutabledict.immutabledict(
+    {
+        "en": "English",
+        "es": "Spanish",
+        "pt": "Portuguese",
+        "ar": "Arabic",
+        "hi": "Hindi",
+        "fr": "French",
+        "ru": "Russian",
+        "de": "German",
+        "ja": "Japanese",
+        "it": "Italian",
+        "bn": "Bengali",
+        "uk": "Ukrainian",
+        "th": "Thai",
+        "ur": "Urdu",
+        "ta": "Tamil",
+        "te": "Telugu",
+        "bg": "Bulgarian",
+        "ko": "Korean",
+        "pl": "Polish",
+        "he": "Hebrew",
+        "fa": "Persian",
+        "vi": "Vietnamese",
+        "ne": "Nepali",
+        "sw": "Swahili",
+        "kn": "Kannada",
+        "mr": "Marathi",
+        "gu": "Gujarati",
+        "pa": "Punjabi",
+        "ml": "Malayalam",
+        "fi": "Finnish",
+    }
+)
+
+_ALPHABETS = "([A-Za-z])"
+_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
+_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
+_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
+_DIGITS = "([0-9])"
+_MULTIPLE_DOTS = r"\.{2,}"
+
+
+def split_into_sentences(text):
+    """Split the text into sentences.
+
+    Args:
+      text: A string that consists of at least one sentence.
+
+    Returns:
+      A list of strings where each string is a sentence.
+    """
+    # "<prd>" and "<stop>" are internal placeholders: "<prd>" marks periods that
+    # do not end a sentence and "<stop>" marks sentence boundaries; both are
+    # resolved back at the end of the function.
+    text = " " + text + " "
+    text = text.replace("\n", " ")
+    text = re.sub(_PREFIXES, "\\1<prd>", text)
+    text = re.sub(_WEBSITES, "<prd>\\1", text)
+    text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
+    text = re.sub(
+        _MULTIPLE_DOTS,
+        lambda match: "<prd>" * len(match.group(0)) + "<stop>",
+        text,
+    )
+    if "Ph.D" in text:
+        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+    text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
+    text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
+    text = re.sub(
+        _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
+        "\\1<prd>\\2<prd>\\3<prd>",
+        text,
+    )
+    text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text)
+    text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
+    text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
+    text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
+    if "”" in text:
+        text = text.replace(".”", "”.")
+    if '"' in text:
+        text = text.replace('."', '".')
+    if "!" in text:
+        text = text.replace('!"', '"!')
+    if "?" in text:
+        text = text.replace('?"', '"?')
+    text = text.replace(".", ".<stop>")
+    text = text.replace("?", "?<stop>")
+    text = text.replace("!", "!<stop>")
+    text = text.replace("<prd>", ".")
+    sentences = text.split("<stop>")
+    sentences = [s.strip() for s in sentences]
+    if sentences and not sentences[-1]:
+        sentences = sentences[:-1]
+    return sentences
+
+
+def count_words(text):
+    """Counts the number of words."""
+    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
+    tokens = tokenizer.tokenize(text)
+    num_words = len(tokens)
+    return num_words
+
+
+@functools.lru_cache(maxsize=None)
+def _get_sentence_tokenizer():
+    return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
+
+
+def count_sentences(text):
+    """Count the number of sentences."""
+    tokenizer = _get_sentence_tokenizer()
+    tokenized_sentences = tokenizer.tokenize(text)
+    return len(tokenized_sentences)
+
+
+def generate_keywords(num_keywords):
+    """Randomly generates a few keywords."""
+    return random.sample(WORD_LIST, k=num_keywords)
diff --git a/lm_eval/tasks/leaderboard/ifeval/utils.py b/lm_eval/tasks/leaderboard/ifeval/utils.py
new file mode 100644
index 00000000..985e8d5a
--- /dev/null
+++ b/lm_eval/tasks/leaderboard/ifeval/utils.py
@@ -0,0 +1,134 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
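+        # NOTE: `if v` drops every falsy value, not just None, so kwargs equal to
+        # 0 or "" are filtered out as well; the IFEval data does not appear to rely
+        # on such values.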
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + if response.strip() and instruction.check_following(response): + is_following_list.append(True) + else: + is_following_list.append(False) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def test_instruction_following_loose( + inp, + response, +): + """Tests response for an upper bound for following instructions.""" + r = response.split("\n") + response_remove_first = "\n".join(r[1:]).strip() + response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + is_following = False + for r in all_responses: + if r.strip() and instruction.check_following(r): + is_following = True + break + + is_following_list.append(is_following) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def process_results(doc, results): + inp = InputExample( + key=doc["key"], + instruction_id_list=doc["instruction_id_list"], + prompt=doc["prompt"], + kwargs=doc["kwargs"], + ) + response = results[0] + + out_strict = test_instruction_following_strict(inp, response) + out_loose = test_instruction_following_loose(inp, response) + + return { + "prompt_level_strict_acc": out_strict.follow_all_instructions, + "inst_level_strict_acc": out_strict.follow_instruction_list, + "prompt_level_loose_acc": out_loose.follow_all_instructions, + "inst_level_loose_acc": out_loose.follow_instruction_list, + } + + +def agg_inst_level_acc(items): + flat_items = [item for sublist in items for item in sublist] + inst_level_acc = sum(flat_items) / len(flat_items) + return inst_level_acc diff --git a/lm_eval/tasks/leaderboard/leaderboard.yaml b/lm_eval/tasks/leaderboard/leaderboard.yaml new file mode 100644 index 00000000..d9c5aaac --- /dev/null +++ b/lm_eval/tasks/leaderboard/leaderboard.yaml @@ -0,0 +1,8 @@ +group: leaderboard +task: + - leaderboard_mmlu_pro + - leaderboard_bbh + - leaderboard_gpqa + - leaderboard_math_hard + - leaderboard_ifeval + - leaderboard_musr diff --git 
a/lm_eval/tasks/leaderboard/math/_template_yaml b/lm_eval/tasks/leaderboard/math/_template_yaml new file mode 100644 index 00000000..a121074b --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/_template_yaml @@ -0,0 +1,28 @@ +group: + - leaderboard_math_hard +dataset_path: lighteval/MATH-Hard +process_docs: !function utils.process_docs +output_type: generate_until +training_split: train +test_split: test +doc_to_text: !function utils.doc_to_text +process_results: !function utils.process_results +doc_to_target: "{{answer if few_shot is undefined else solution}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 + max_gen_toks: 1024 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +num_fewshot: 4 +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true +fewshot_config: + sampler: first_n + samples: !function utils.list_fewshot_samples diff --git a/lm_eval/tasks/leaderboard/math/math_algebra.yaml b/lm_eval/tasks/leaderboard/math/math_algebra.yaml new file mode 100644 index 00000000..9e1d4753 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_algebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: algebra +task: leaderboard_math_algebra_hard diff --git a/lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml b/lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml new file mode 100644 index 00000000..41767099 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: counting_and_probability +task: leaderboard_math_counting_and_prob_hard diff --git a/lm_eval/tasks/leaderboard/math/math_geometry.yaml b/lm_eval/tasks/leaderboard/math/math_geometry.yaml new file mode 100644 index 00000000..189633c3 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_geometry.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: geometry +task: leaderboard_math_geometry_hard diff --git a/lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml b/lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml new file mode 100644 index 00000000..9959bd18 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: intermediate_algebra +task: leaderboard_math_intermediate_algebra_hard diff --git a/lm_eval/tasks/leaderboard/math/math_num_theory.yaml b/lm_eval/tasks/leaderboard/math/math_num_theory.yaml new file mode 100644 index 00000000..a055d94d --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_num_theory.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: number_theory +task: leaderboard_math_num_theory_hard diff --git a/lm_eval/tasks/leaderboard/math/math_prealgebra.yaml b/lm_eval/tasks/leaderboard/math/math_prealgebra.yaml new file mode 100644 index 00000000..474d7258 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_prealgebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: prealgebra +task: leaderboard_math_prealgebra_hard diff --git a/lm_eval/tasks/leaderboard/math/math_precalculus.yaml b/lm_eval/tasks/leaderboard/math/math_precalculus.yaml new file mode 100644 index 00000000..4d63541e --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_precalculus.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: precalculus +task: leaderboard_math_precalculus_hard diff --git a/lm_eval/tasks/leaderboard/math/utils.py b/lm_eval/tasks/leaderboard/math/utils.py new file mode 100644 index 00000000..e3ebcf99 --- 
/dev/null +++ b/lm_eval/tasks/leaderboard/math/utils.py @@ -0,0 +1,294 @@ +import re +import signal +from typing import Dict, List, Optional + +import datasets + +from lm_eval.utils import eval_logger + + +try: + import sympy + from sympy.parsing.latex import parse_latex +except ModuleNotFoundError: + raise ModuleNotFoundError( + "`sympy` is required for generating translation task prompt templates. \ +please install sympy via pip install lm-eval[math] or pip install -e .[math]", + ) + + +# taken from +# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py +def doc_to_text(doc: dict) -> str: + return "Problem:" + "\n" + doc["problem"] + "\n\n" + "Solution:" + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["problem"], + "solution": doc["solution"], + "answer": normalize_final_answer( + remove_boxed(last_boxed_only_string(doc["solution"])) + ), + } + if getattr(doc, "few_shot", None) is not None: + out_doc["few_shot"] = True + return out_doc + + return dataset.map(_process_doc) + + +def list_fewshot_samples() -> list[dict]: + return [ + { + "problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}", + "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$", + "solution": "We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?", + "solution": "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:\n\\begin{align*}\n30n&=480\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "If the system of equations\n\n\\begin{align*}\n6x-4y&=a,\\\n6y-9x &=b.\n\\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,\nfind $\\frac{a}{b},$ assuming $b$ is nonzero.", + "solution": "If we multiply the first equation by $-\\frac{3}{2}$, we obtain\n\n$$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have\n\n$$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. 
I hope it is correct.", + "few_shot": "1", + }, + ] + + +def process_results(doc: dict, results: List[str]) -> Dict[str, int]: + candidates = results[0] + + unnormalized_answer = get_unnormalized_answer(candidates) + answer = normalize_final_answer(unnormalized_answer) + + if is_equiv(answer, doc["answer"]): + retval = 1 + else: + retval = 0 + + results = { + "exact_match": retval, + } + return results + + +def last_boxed_only_string(string: str) -> Optional[str]: + idx = string.rfind("\\boxed") + if "\\boxed " in string: + return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def remove_boxed(s: str) -> str: + if "\\boxed " in s: + left = "\\boxed " + assert s[: len(left)] == left + return s[len(left) :] + + left = "\\boxed{" + + assert s[: len(left)] == left + assert s[-1] == "}" + + return s[len(left) : -1] + + +class timeout: + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) + + +def is_equiv(x1: str, x2: str) -> bool: + """ + x1 and x2 are normalized latex string + """ + try: + with timeout(seconds=5): + try: + parsed_x1 = parse_latex(x1) + parsed_x2 = parse_latex(x2) + except ( + sympy.parsing.latex.errors.LaTeXParsingError, + sympy.SympifyError, + TypeError, + ): + eval_logger.debug(f"couldn't parse one of {x1} or {x2}") + return False + + try: + diff = parsed_x1 - parsed_x2 + except TypeError: + eval_logger.debug(f"couldn't subtract {x1} and {x2}") + return False + + try: + if sympy.simplify(diff) == 0: + return True + else: + return False + except ValueError: + eval_logger.debug( + f"Had some trouble simplifying when comparing {x1} and {x2}" + ) + except TimeoutError: + eval_logger.debug(f"Timed out comparing {x1} and {x2}") + return False + except ImportError as e: + eval_logger.error(e) + raise + except Exception as e: + eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}") + return False + + +def get_unnormalized_answer(text: str) -> str: + INVALID_ANSWER = "[invalidanswer]" + end_seq = "I hope it is correct." + text += end_seq + match = re.search( + r"Final Answer: The final answer is(.*?). 
I hope it is correct.", + text, + ) + if match: + return match.group(1).strip() + else: + return INVALID_ANSWER + + +SUBSTITUTIONS = [ + ("an ", ""), + ("a ", ""), + (".$", "$"), + ("\\$", ""), + (r"\ ", ""), + (" ", ""), + ("mbox", "text"), + (",\\text{and}", ","), + ("\\text{and}", ","), + ("\\text{m}", "\\text{}"), +] +REMOVED_EXPRESSIONS = [ + "square", + "ways", + "integers", + "dollars", + "mph", + "inches", + "ft", + "hours", + "km", + "units", + "\\ldots", + "sue", + "points", + "feet", + "minutes", + "digits", + "cents", + "degrees", + "cm", + "gm", + "pounds", + "meters", + "meals", + "edges", + "students", + "childrentickets", + "multiples", + "\\text{s}", + "\\text{.}", + "\\text{\ns}", + "\\text{}^2", + "\\text{}^3", + "\\text{\n}", + "\\text{}", + r"\mathrm{th}", + r"^\circ", + r"^{\circ}", + r"\;", + r",\!", + "{,}", + '"', + "\\dots", +] + + +def normalize_final_answer(final_answer: str) -> str: + """ + Normalize a final answer to a quantitative reasoning question. + + Copied character for character from appendix D of Lewkowycz et al. (2022) + """ + final_answer = final_answer.split("=")[-1] + + for before, after in SUBSTITUTIONS: + final_answer = final_answer.replace(before, after) + for expr in REMOVED_EXPRESSIONS: + final_answer = final_answer.replace(expr, "") + + # Extract answer that is in LaTeX math, is bold, + # is surrounded by a box, etc. + final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + + return final_answer diff --git a/lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml b/lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml new file mode 100644 index 00000000..8ddabf14 --- /dev/null +++ b/lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml @@ -0,0 +1,17 @@ +dataset_path: TIGER-Lab/MMLU-Pro # a copy of `cais/leaderboard_mmlu` with no auxiliary_train split +task: leaderboard_mmlu_pro +test_split: test +fewshot_split: validation +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_choice: !function utils.doc_to_choice +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +num_fewshot: 5 +metadata: + version: 0.1 diff --git a/lm_eval/tasks/leaderboard/mmlu_pro/utils.py b/lm_eval/tasks/leaderboard/mmlu_pro/utils.py new file mode 100644 index 00000000..bb398262 --- /dev/null +++ b/lm_eval/tasks/leaderboard/mmlu_pro/utils.py @@ -0,0 +1,15 @@ +import string + + +def doc_to_text(doc): + doc_to_text = f"{doc['question']}\n" + + for i in range(len(doc["options"])): + doc_to_text += f"{string.ascii_uppercase[i]}. 
{doc['options'][i]}\n" + + doc_to_text += "Answer:" + return doc_to_text + + +def doc_to_choice(doc): + return [string.ascii_uppercase[i] for i in range(len(doc["options"]))] diff --git a/lm_eval/tasks/leaderboard/musr/_template_yaml b/lm_eval/tasks/leaderboard/musr/_template_yaml new file mode 100644 index 00000000..2076cb9f --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/_template_yaml @@ -0,0 +1,13 @@ +group: + - leaderboard_musr +dataset_path: TAUR-Lab/MuSR +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{answer_choice}}" +doc_to_choice: "{{choices}}" +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml b/lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml new file mode 100644 index 00000000..88aac33e --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml @@ -0,0 +1,3 @@ +include: "_template_yaml" +task: leaderboard_musr_murder_mysteries +test_split: murder_mysteries diff --git a/lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml b/lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml new file mode 100644 index 00000000..89b29fb7 --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml @@ -0,0 +1,3 @@ +include: "_template_yaml" +task: leaderboard_musr_object_placements +test_split: object_placements diff --git a/lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml b/lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml new file mode 100644 index 00000000..8ede2ba9 --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml @@ -0,0 +1,3 @@ +include: "_template_yaml" +task: leaderboard_musr_team_allocation +test_split: team_allocation diff --git a/lm_eval/tasks/leaderboard/musr/utils.py b/lm_eval/tasks/leaderboard/musr/utils.py new file mode 100644 index 00000000..1d0a7d1c --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/utils.py @@ -0,0 +1,26 @@ +import ast + + +def doc_to_choice(doc): + """ + Convert a doc to a choice. + """ + return ast.literal_eval(doc["choices"]) + + +DOC_TO_TEXT = "{narrative}\n\n" "{question}\n\n" "{choices}\n" "Answer:" + + +def doc_to_text(doc): + """ + Convert a doc to text. 
+ """ + choices = "" + for i, choice in enumerate(ast.literal_eval(doc["choices"])): + choices += f"{i+1} - {choice}\n" + + text = DOC_TO_TEXT.format( + narrative=doc["narrative"], question=doc["question"], choices=choices + ) + + return text -- GitLab From d855d0baf8576296e790d0c9477b40a710d28e67 Mon Sep 17 00:00:00 2001 From: Hanwool Albert Lee <88315152+h-albert-lee@users.noreply.github.com> Date: Wed, 3 Jul 2024 21:17:15 +0900 Subject: [PATCH 20/45] #1442 inverse scaling tasks implementation (#1589) * initial_implementation (test has to be proceeded) * minor fix * revised task name and implemented new task * minor fixes * new tasks implement * minor fix * added 'prompt injection' task * delete prompt injection task (will be implemented at next PR) * trust remote code * Update lm_eval/tasks/inverse_scaling/README.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * added readme * Update lm_eval/tasks/README.md * Update lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml * Update lm_eval/tasks/inverse_scaling/README.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * Update lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * Update README.md * precommit? * run precommit on readme --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Co-authored-by: haileyschoelkopf --- lm_eval/tasks/README.md | 1 + lm_eval/tasks/inverse_scaling/README.md | 55 +++++++++++++++++++ .../inverse_scaling/_inverse_scaling_mc_yaml | 17 ++++++ .../inverse_scaling_hindsight_neglect.yaml | 3 + .../inverse_scaling_into_the_unknown.yaml | 3 + .../inverse_scaling_memo_trap.yaml | 3 + .../inverse_scaling_modus_tollens.yaml | 3 + .../inverse_scaling/inverse_scaling_neqa.yaml | 3 + ..._scaling_pattern_matching_suppression.yaml | 3 + .../inverse_scaling_quote_repetition.yaml | 3 + .../inverse_scaling_redefine_math.yaml | 3 + .../inverse_scaling_repetitive_algebra.yaml | 3 + .../inverse_scaling_sig_figs.yaml | 3 + ...verse_scaling_winobias_antistereotype.yaml | 20 +++++++ 14 files changed, 123 insertions(+) create mode 100644 lm_eval/tasks/inverse_scaling/README.md create mode 100644 lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 5d6393bb..1eb7fdc5 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -49,6 +49,7 @@ | [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to 
evaluate the ethical reasoning capabilities of models. | English | | [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English | | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English | +| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English | | [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean | | [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean | | [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean | diff --git a/lm_eval/tasks/inverse_scaling/README.md b/lm_eval/tasks/inverse_scaling/README.md new file mode 100644 index 00000000..10364d8e --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/README.md @@ -0,0 +1,55 @@ +# inverse_scaling + +### Paper + +Title: `Inverse Scaling: When Bigger Isn't Better` + +Abstract: `Work on scaling laws has found that large language models (LMs) show predictable improvements to overall loss with increased scale (model size, training data, and compute). Here, we present evidence for the claim that LMs may show inverse scaling, or worse task performance with increased scale, e.g., due to flaws in the training objective and data. We present empirical evidence of inverse scaling on 11 datasets collected by running a public contest, the Inverse Scaling Prize, with a substantial prize pool. Through analysis of the datasets, along with other examples found in the literature, we identify four potential causes of inverse scaling: (i) preference to repeat memorized sequences over following in-context instructions, (ii) imitation of undesirable patterns in the training data, (iii) tasks containing an easy distractor task which LMs could focus on, rather than the harder real task, and (iv) correct but misleading few-shot demonstrations of the task. We release the winning datasets at this https URL to allow for further investigation of inverse scaling. Our tasks have helped drive the discovery of U-shaped and inverted-U scaling trends, where an initial trend reverses, suggesting that scaling trends are less reliable at predicting the behavior of larger-scale models than previously understood. Overall, our results suggest that there are tasks for which increased model scale alone may not lead to progress, and that more careful thought needs to go into the data and objectives for training language models.` + +Note: This is not official implementation of inverse scaling prize. Implemented by h-albert-lee with permission from the authors of the paper. + +Homepage: https://github.com/inverse-scaling/prize + +### Citation + +@article{mckenzie2023inverse, + title={Inverse Scaling: When Bigger Isn't Better}, + author={Ian R. McKenzie and Alexander Lyzhov and Michael Pieler and Alicia Parrish and Aaron Mueller and Ameya Prabhu and Euan McLean and Aaron Kirtland and Alexis Ross and Alisa Liu and Andrew Gritsevskiy and Daniel Wurgaft and Derik Kauffman and Gabriel Recchia and Jiacheng Liu and Joe Cavanagh and Max Weiss and Sicong Huang and The Floating Droid and Tom Tseng and Tomasz Korbak and Xudong Shen and Yuhui Zhang and Zhengping Zhou and Najoung Kim and Samuel R. 
Bowman and Ethan Perez},
+    journal={arXiv preprint arXiv:2306.09479},
+    year={2023}
+}
+
+### Groups and Tasks
+
+#### Groups
+
+* `inverse_scaling_mc`: all tasks of the Inverse Scaling Prize (currently aside from Prompt Injection), matching their implementations on OPT for multiple-choice classification tasks. **These match the published dataset versions from the prize, which may differ slightly from the numbers in the paper (but have been tested for equivalence to the OPT numbers reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval for multiple sizes).**
+
+
+#### Tasks
+
+- `inverse_scaling_hindsight_neglect_10shot`
+- `inverse_scaling_redefine_math`
+- `inverse_scaling_quote_repetition`
+- `inverse_scaling_neqa`
+- `inverse_scaling_winobias_antistereotype`: not an official Inverse Scaling Prize winner, but eval results for it are reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval .
+- `inverse_scaling_into_the_unknown`
+- `inverse_scaling_memo_trap`
+- `inverse_scaling_modus_tollens`
+- `inverse_scaling_pattern_matching_suppression`
+- `inverse_scaling_repetitive_algebra`
+- `inverse_scaling_sig_figs`
+
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
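For illustration, a group registered this way, such as `inverse_scaling_mc`, can be evaluated through the harness's Python API as well as the CLI. The sketch below is not part of the diff above; it assumes only that `lm_eval` is installed, and the model name and batch size are placeholder choices rather than values taken from this patch.

```python
# Minimal sketch (not from this patch): evaluate the inverse_scaling_mc group.
# "EleutherAI/pythia-1.4b" and batch_size=8 are placeholder choices for illustration.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",  # Hugging Face transformers backend
    model_args="pretrained=EleutherAI/pythia-1.4b",
    tasks=["inverse_scaling_mc"],  # a group name expands to all of its member tasks
    batch_size=8,
)

# Per-task acc / acc_norm scores are collected under results["results"].
print(results["results"])
```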
diff --git a/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml b/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml new file mode 100644 index 00000000..4dc3df7a --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml @@ -0,0 +1,17 @@ +group: + - inverse_scaling_mc +output_type: multiple_choice +test_split: train +doc_to_text: prompt +doc_to_choice: classes +doc_to_target: answer_index +target_delimiter: "" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 0 diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml new file mode 100644 index 00000000..b21baee2 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_hindsight_neglect_10shot +dataset_path: inverse-scaling/hindsight-neglect-10shot diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml new file mode 100644 index 00000000..0c7ccd3b --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_into_the_unknown +dataset_path: Albertmade/into-the-unknown diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml new file mode 100644 index 00000000..5c77ffc2 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_memo_trap +dataset_path: Albertmade/memo-trap diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml new file mode 100644 index 00000000..89e1ebbc --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_modus_tollens +dataset_path: Albertmade/modus-tollens diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml new file mode 100644 index 00000000..11a057d8 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_neqa +dataset_path: inverse-scaling/NeQA diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml new file mode 100644 index 00000000..b415bba1 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_pattern_matching_suppression +dataset_path: Albertmade/pattern-matching-suppression diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml new file mode 100644 index 00000000..6750953c --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_quote_repetition +dataset_path: inverse-scaling/quote-repetition diff --git 
a/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml new file mode 100644 index 00000000..2f14ba28 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_redefine_math +dataset_path: inverse-scaling/redefine-math diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml new file mode 100644 index 00000000..048ea271 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_repetitive_algebra +dataset_path: Albertmade/repetitive-algebra diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml new file mode 100644 index 00000000..25d86e33 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_sig_figs +dataset_path: Albertmade/sig-figs diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml new file mode 100644 index 00000000..4c5ff300 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml @@ -0,0 +1,20 @@ +group: + - inverse_scaling_mc +task: inverse_scaling_winobias_antistereotype +dataset_path: mathemakitten/winobias_antistereotype_test_v5 +output_type: multiple_choice +test_split: test +doc_to_text: text +doc_to_choice: classes +doc_to_target: target +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +dataset_kwargs: + trust_remote_code: true +metadata: + version: 0 -- GitLab From 5a7ed3eee00eca378f157abbe9baf4d406f063f2 Mon Sep 17 00:00:00 2001 From: Choyunhui Date: Mon, 8 Jul 2024 16:34:01 +0900 Subject: [PATCH 21/45] Fix TypeError in samplers.py by converting int to str (#2074) Co-authored-by: yhjo --- lm_eval/api/samplers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/api/samplers.py b/lm_eval/api/samplers.py index 74513bbc..ac75c722 100644 --- a/lm_eval/api/samplers.py +++ b/lm_eval/api/samplers.py @@ -55,7 +55,7 @@ class ContextSampler: labeled_examples += ( str(doc_target[0]) if isinstance(doc_target, list) - else doc_target + else str(doc_target) if self.config.doc_to_choice is None or isinstance(doc_target, str) else str(self.doc_to_choice(doc)[doc_target]) ) -- GitLab From 517aadc4d05507e2fea7b7753e91e6ad7dd76847 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 8 Jul 2024 22:12:35 +0700 Subject: [PATCH 22/45] Group agg rework (#1741) * add greoup_config arg * add a group config that allows disabling table for group score and group aggregate in general * fixed size configuration * adjust config * add group config * adjust mmlu to use group_config * fixed args input in aggregate_subtask_metrics * fixed issues related to printing alias of group and updated yaml * update all mmlu variants to include group_config * edit format * modify mmlu tasks * adjust group to also be a configurable group * add configurable group * simplify get_task_list * adjust group scoring with using ConfigurableGroup * adjust args * update mmlu * update mmlu * update to work with 
new group and task configuration * readd group_agg * readd files * move prepare_print_tasks to evaluator_utils * sort set to False by default, fix predict_only arg * add version for groups * reversed task list * update additional condition when loading a group in a group yaml * update truthfulqa * add description regarding tags replacing group * replace group to tag * fixed conditional statement * remove warning * update loading of task group and newly added tags * reformat with pre-commit * fixed info log * update * fix bug * fix bug * use task id to differentiate tasks * convert all groups to configurable groups * use task_id * reformat * add task_id for python tasks as well * add task_id for python tasks as well * add task_id for python tasks as well * revert truthfulqa * revert mmlu tasks * new mmlu config * new group config parameter `tag_to_task` * Update truthfulqa_mc2.yaml * reformate * add _process_group_config * adjust task_id * add get_subtask_list function to get proper subtask list * group config to_dict update * remove tag check * update mmlu * fix config passing issues * add test yaml * format fix * add documentation * corner case for single tag being called * fix indentation * formatting * update all mmlu variants * Update docs/task_guide.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * remove group_alias * Update docs/task_guide.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * remove version for metadata * Update docs/task_guide.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * update mmlu/ * removed " " in make_table * change how aggregate_metric is loaded * change how aggregate_metric is loaded * update aggregate_metric arg * update format * update format * some docs fixes * add groups for agieval, aexams, aclue * add more explicit aggregation groups * add more groupings / tags distinctions * add more groupings * more groupings * add many explicit group configs * add many explicit group configs * add more explicit group configs * add more explicit group configs * add more error msgs, agg_metric -> agg_metric_list * some docs updates * update task_id to be updateable and uses group:task format * make KMMLU a tag for now * update docs * don't duplicate task names * fix merge conflicts? * giving this a try * clean up diff * switch mmlu variants over to using * don't use to-be-deprecated group: config field in overview notebook * Python tasks which subclass ConfigurableTask now run * update mmlu * pre-commit format * fixed sorting for multi-level printing * move group api to separate file * fix bbh aggregation filter usage * track api/group.py * adjust group and tags loading * make explicit group configs for leaderboard and other newer tasks * fix arabicmmlu * update * change arabicmmlu template name??? 
* update group alias * fix printing bugs * check table printing is correct ; update tests * use mmlu_stem to have a group included in print tests --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Co-authored-by: haileyschoelkopf --- docs/new_task_guide.md | 92 +++-- docs/task_guide.md | 114 +----- examples/lm-eval-overview.ipynb | 5 +- lm_eval/api/group.py | 117 ++++++ lm_eval/api/task.py | 21 +- lm_eval/evaluator.py | 226 +++++------ lm_eval/evaluator_utils.py | 350 ++++++++++++++---- lm_eval/tasks/__init__.py | 245 ++++++++---- lm_eval/tasks/aclue/README.md | 2 +- lm_eval/tasks/aclue/_aclue.yaml | 26 ++ lm_eval/tasks/aclue/_default_template_yaml | 1 - lm_eval/tasks/aexams/README.md | 4 +- lm_eval/tasks/aexams/_aexams.yaml | 16 + lm_eval/tasks/aexams/_default_template_yaml | 1 - lm_eval/tasks/agieval/README.md | 6 +- lm_eval/tasks/agieval/agieval.yaml | 29 ++ lm_eval/tasks/agieval/agieval_cn.yaml | 19 + lm_eval/tasks/agieval/agieval_en.yaml | 18 + lm_eval/tasks/agieval/agieval_nous.yaml | 16 + lm_eval/tasks/agieval/aqua-rat.yaml | 4 - lm_eval/tasks/agieval/gaokao-biology.yaml | 3 - lm_eval/tasks/agieval/gaokao-chemistry.yaml | 3 - lm_eval/tasks/agieval/gaokao-chinese.yaml | 3 - lm_eval/tasks/agieval/gaokao-english.yaml | 3 - lm_eval/tasks/agieval/gaokao-geography.yaml | 3 - lm_eval/tasks/agieval/gaokao-history.yaml | 3 - lm_eval/tasks/agieval/gaokao-mathcloze.yaml | 3 - lm_eval/tasks/agieval/gaokao-mathqa.yaml | 3 - lm_eval/tasks/agieval/gaokao-physics.yaml | 3 - lm_eval/tasks/agieval/jec-qa-ca.yaml | 3 - lm_eval/tasks/agieval/jec-qa-kd.yaml | 3 - lm_eval/tasks/agieval/logiqa-en.yaml | 4 - lm_eval/tasks/agieval/logiqa-zh.yaml | 3 - lm_eval/tasks/agieval/lsat-ar.yaml | 4 - lm_eval/tasks/agieval/lsat-lr.yaml | 4 - lm_eval/tasks/agieval/lsat-rc.yaml | 4 - lm_eval/tasks/agieval/math.yaml | 3 - .../tasks/agieval/sat-en-without-passage.yaml | 4 - lm_eval/tasks/agieval/sat-en.yaml | 4 - lm_eval/tasks/agieval/sat-math.yaml | 4 - lm_eval/tasks/anli/anli_r1.yaml | 2 +- .../{arabicmmlu.yaml => _arabicmmlu.yaml} | 5 + .../arabicmmlu/_arabicmmlu_humanities.yaml | 9 + .../arabicmmlu/_arabicmmlu_language.yaml | 9 + .../tasks/arabicmmlu/_arabicmmlu_other.yaml | 9 + .../_arabicmmlu_social_science.yaml | 9 + .../tasks/arabicmmlu/_arabicmmlu_stem.yaml | 9 + ...yaml => _default_arabicmmlu_template_yaml} | 2 + lm_eval/tasks/arabicmmlu/_generate_configs.py | 5 +- .../arabicmmlu_arabic_language_general.yaml | 5 +- .../arabicmmlu_arabic_language_grammar.yaml | 5 +- .../arabicmmlu/arabicmmlu_driving_test.yaml | 5 +- .../arabicmmlu_general_knowledge.yaml | 5 +- .../arabicmmlu_high_arabic_language.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_biology.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_civics.yaml | 5 +- .../arabicmmlu_high_computer_science.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_economics.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_geography.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_history.yaml | 5 +- .../arabicmmlu_high_islamic_studies.yaml | 5 +- .../arabicmmlu_high_philosophy.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_physics.yaml | 5 +- .../arabicmmlu_islamic_studies.yaml | 5 +- .../arabicmmlu_middle_arabic_language.yaml | 5 +- .../arabicmmlu/arabicmmlu_middle_civics.yaml | 5 +- .../arabicmmlu_middle_computer_science.yaml | 5 +- .../arabicmmlu_middle_economics.yaml | 5 +- .../arabicmmlu_middle_general_knowledge.yaml | 5 +- .../arabicmmlu_middle_geography.yaml | 5 +- .../arabicmmlu/arabicmmlu_middle_history.yaml | 5 +- 
.../arabicmmlu_middle_islamic_studies.yaml | 5 +- .../arabicmmlu_middle_natural_science.yaml | 5 +- .../arabicmmlu_middle_social_science.yaml | 5 +- .../arabicmmlu_primary_arabic_language.yaml | 5 +- .../arabicmmlu_primary_computer_science.yaml | 5 +- .../arabicmmlu_primary_general_knowledge.yaml | 5 +- .../arabicmmlu_primary_geography.yaml | 5 +- .../arabicmmlu_primary_history.yaml | 5 +- .../arabicmmlu_primary_islamic_studies.yaml | 5 +- .../arabicmmlu/arabicmmlu_primary_math.yaml | 5 +- .../arabicmmlu_primary_natural_science.yaml | 5 +- .../arabicmmlu_primary_social_science.yaml | 5 +- .../tasks/arabicmmlu/arabicmmlu_prof_law.yaml | 5 +- .../arabicmmlu_univ_accounting.yaml | 5 +- .../arabicmmlu_univ_computer_science.yaml | 5 +- .../arabicmmlu/arabicmmlu_univ_economics.yaml | 5 +- .../arabicmmlu_univ_management.yaml | 5 +- .../arabicmmlu_univ_political_science.yaml | 5 +- lm_eval/tasks/arc/README.md | 6 +- lm_eval/tasks/arc/arc_easy.yaml | 2 +- lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml | 2 +- lm_eval/tasks/arithmetic/README.md | 4 +- lm_eval/tasks/arithmetic/arithmetic_1dc.yaml | 2 +- lm_eval/tasks/asdiv/README.md | 2 +- lm_eval/tasks/babi/README.md | 6 +- lm_eval/tasks/basqueglue/README.md | 8 +- lm_eval/tasks/basqueglue/bec.yaml | 2 +- lm_eval/tasks/basqueglue/bhtc.yaml | 2 +- lm_eval/tasks/basqueglue/coref.yaml | 2 +- lm_eval/tasks/basqueglue/qnli.yaml | 2 +- lm_eval/tasks/basqueglue/vaxx.yaml | 2 +- lm_eval/tasks/basqueglue/wic.yaml | 2 +- lm_eval/tasks/bbh/README.md | 6 +- lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml | 36 ++ .../bbh/cot_fewshot/_bbh_cot_fewshot.yaml | 36 ++ .../cot_fewshot/_cot_fewshot_template_yaml | 3 - .../bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml | 36 ++ .../cot_zeroshot/_cot_zeroshot_template_yaml | 1 - lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml | 35 ++ .../tasks/bbh/fewshot/_fewshot_template_yaml | 1 - lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml | 36 ++ .../bbh/zeroshot/_zeroshot_template_yaml | 1 - lm_eval/tasks/belebele/_belebele.yaml | 133 +++++++ lm_eval/tasks/belebele/_default_template_yaml | 1 - lm_eval/tasks/belebele/_generate_configs.py | 33 ++ .../tasks/benchmarks/flan/flan_held_in.yaml | 141 ++++--- lm_eval/tasks/benchmarks/minerva_math.yaml | 6 + .../benchmarks/multimedqa/multimedqa.yaml | 4 + lm_eval/tasks/bertaqa/_bertaqa_template | 2 +- lm_eval/tasks/blimp/_blimp.yaml | 75 ++++ lm_eval/tasks/blimp/_template_yaml | 1 - lm_eval/tasks/ceval/_ceval-valid.yaml | 63 ++++ lm_eval/tasks/ceval/_default_ceval_yaml | 1 - lm_eval/tasks/ceval/_generate_configs.py | 25 +- lm_eval/tasks/cmmlu/_cmmlu.yaml | 78 ++++ lm_eval/tasks/cmmlu/_default_template_yaml | 1 - lm_eval/tasks/cmmlu/_generate_configs.py | 30 ++ lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml | 4 + .../tasks/cmmlu/cmmlu_ancient_chinese.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_arts.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_astronomy.yaml | 4 + .../tasks/cmmlu/cmmlu_business_ethics.yaml | 4 + .../cmmlu_chinese_civil_service_exam.yaml | 4 + .../cmmlu/cmmlu_chinese_driving_rule.yaml | 4 + .../cmmlu/cmmlu_chinese_food_culture.yaml | 4 + .../cmmlu/cmmlu_chinese_foreign_policy.yaml | 4 + .../tasks/cmmlu/cmmlu_chinese_history.yaml | 4 + .../tasks/cmmlu/cmmlu_chinese_literature.yaml | 4 + .../cmmlu_chinese_teacher_qualification.yaml | 4 + .../tasks/cmmlu/cmmlu_clinical_knowledge.yaml | 4 + .../cmmlu_college_actuarial_science.yaml | 4 + .../tasks/cmmlu/cmmlu_college_education.yaml | 4 + .../cmmlu_college_engineering_hydrology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_college_law.yaml 
| 4 + .../cmmlu/cmmlu_college_mathematics.yaml | 4 + .../cmmlu_college_medical_statistics.yaml | 4 + .../tasks/cmmlu/cmmlu_college_medicine.yaml | 4 + .../tasks/cmmlu/cmmlu_computer_science.yaml | 4 + .../tasks/cmmlu/cmmlu_computer_security.yaml | 4 + .../tasks/cmmlu/cmmlu_conceptual_physics.yaml | 4 + ...cmmlu_construction_project_management.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_economics.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_education.yaml | 4 + .../cmmlu/cmmlu_electrical_engineering.yaml | 4 + .../tasks/cmmlu/cmmlu_elementary_chinese.yaml | 4 + .../cmmlu/cmmlu_elementary_commonsense.yaml | 4 + ...elementary_information_and_technology.yaml | 4 + .../cmmlu/cmmlu_elementary_mathematics.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_food_science.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_genetics.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_global_facts.yaml | 4 + .../cmmlu/cmmlu_high_school_biology.yaml | 4 + .../cmmlu/cmmlu_high_school_chemistry.yaml | 4 + .../cmmlu/cmmlu_high_school_geography.yaml | 4 + .../cmmlu/cmmlu_high_school_mathematics.yaml | 4 + .../cmmlu/cmmlu_high_school_physics.yaml | 4 + .../cmmlu/cmmlu_high_school_politics.yaml | 4 + .../tasks/cmmlu/cmmlu_human_sexuality.yaml | 4 + .../tasks/cmmlu/cmmlu_international_law.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_journalism.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_jurisprudence.yaml | 4 + .../cmmlu/cmmlu_legal_and_moral_basis.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_logical.yaml | 4 + .../tasks/cmmlu/cmmlu_machine_learning.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_management.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_marketing.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_modern_chinese.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml | 4 + .../cmmlu/cmmlu_professional_accounting.yaml | 4 + .../tasks/cmmlu/cmmlu_professional_law.yaml | 4 + .../cmmlu/cmmlu_professional_medicine.yaml | 4 + .../cmmlu/cmmlu_professional_psychology.yaml | 4 + .../tasks/cmmlu/cmmlu_public_relations.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_security_study.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_sociology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_sports_science.yaml | 4 + .../cmmlu_traditional_chinese_medicine.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_virology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_world_history.yaml | 4 + .../tasks/cmmlu/cmmlu_world_religions.yaml | 4 + lm_eval/tasks/copal_id/standard.yaml | 2 +- .../crows_pairs/crows_pairs_english.yaml | 4 +- lm_eval/tasks/csatqa/_csatqa.yaml | 17 + lm_eval/tasks/csatqa/_default_csatqa_yaml | 1 - lm_eval/tasks/fda/task.py | 2 +- lm_eval/tasks/fld/fld_default.yaml | 2 - lm_eval/tasks/french_bench/README.md | 4 +- .../french_bench_arc_challenge.yaml | 2 +- .../french_bench/french_bench_boolqa.yaml | 2 +- .../french_bench/french_bench_fquadv2.yaml | 2 +- .../french_bench_fquadv2_bool.yaml | 2 +- .../french_bench_fquadv2_genq.yaml | 2 +- .../french_bench_fquadv2_hasAns.yaml | 2 +- .../french_bench/french_bench_grammar.yaml | 2 +- .../french_bench/french_bench_hellaswag.yaml | 2 +- .../french_bench/french_bench_multifquad.yaml | 2 +- .../french_bench_opus_perplexity.yaml | 2 +- .../french_bench_orangesum_abstract.yaml | 2 +- .../french_bench_orangesum_title.yaml | 2 +- .../french_bench_reading_comp.yaml | 2 +- .../french_bench_topic_based_nli.yaml | 2 +- .../french_bench/french_bench_trivia.yaml | 2 +- .../french_bench/french_bench_vocab.yaml | 2 +- .../french_bench_wikitext_fr.yaml | 2 +- 
.../tasks/french_bench/french_bench_xnli.yaml | 2 +- lm_eval/tasks/glue/README.md | 6 +- lm_eval/tasks/glue/cola/default.yaml | 2 +- lm_eval/tasks/glue/mnli/default.yaml | 2 +- lm_eval/tasks/glue/mrpc/default.yaml | 2 +- lm_eval/tasks/glue/qnli/default.yaml | 2 +- lm_eval/tasks/glue/qqp/default.yaml | 2 +- lm_eval/tasks/glue/rte/default.yaml | 2 +- lm_eval/tasks/glue/sst2/default.yaml | 2 +- lm_eval/tasks/glue/wnli/default.yaml | 2 +- lm_eval/tasks/gpqa/README.md | 8 +- .../gpqa/cot_n_shot/_gpqa_cot_n_shot_yaml | 2 +- .../gpqa/cot_zeroshot/_gpqa_cot_zeroshot_yaml | 2 +- .../generative/_gpqa_generative_n_shot_yaml | 2 +- lm_eval/tasks/gpqa/n_shot/_gpqa_n_shot_yaml | 2 +- .../tasks/gpqa/zeroshot/_gpqa_zeroshot_yaml | 2 +- .../gsm8k/gsm8k-cot-self-consistency.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k-cot-zeroshot.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k-cot.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k.yaml | 2 +- lm_eval/tasks/haerae/_default_haerae_yaml | 1 - lm_eval/tasks/haerae/_haerae.yaml | 16 + lm_eval/tasks/headqa/headqa_en.yaml | 3 +- lm_eval/tasks/hellaswag/hellaswag.yaml | 2 +- .../tasks/hendrycks_ethics/commonsense.yaml | 2 +- lm_eval/tasks/hendrycks_ethics/justice.yaml | 2 +- .../hendrycks_ethics/utilitarianism.yaml | 2 +- lm_eval/tasks/hendrycks_ethics/virtue.yaml | 2 +- .../tasks/hendrycks_math/hendrycks_math.yaml | 6 + .../hendrycks_math_algebra.yaml | 2 +- .../inverse_scaling/_inverse_scaling_mc_yaml | 2 +- lm_eval/tasks/inverse_scaling/_some_results | 39 ++ lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml | 2 +- lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml | 2 +- .../kmmlu/direct_hard/_direct_hard_kmmlu_yaml | 2 +- lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml | 2 +- lm_eval/tasks/kobest/kobest_boolq.yaml | 2 +- lm_eval/tasks/kobest/kobest_copa.yaml | 2 +- lm_eval/tasks/kobest/kobest_hellaswag.yaml | 2 +- lm_eval/tasks/kobest/kobest_sentineg.yaml | 2 +- lm_eval/tasks/kobest/kobest_wic.yaml | 2 +- lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml | 11 + .../tasks/kormedmcqa/kormedmcqa_doctor.yaml | 1 - .../tasks/kormedmcqa/kormedmcqa_nurse.yaml | 1 - .../tasks/kormedmcqa/kormedmcqa_pharm.yaml | 1 - lm_eval/tasks/lambada/lambada_openai.yaml | 2 +- lm_eval/tasks/lambada/lambada_standard.yaml | 2 +- .../lambada_cloze/lambada_openai_cloze.yaml | 2 +- .../lambada_cloze/lambada_standard_cloze.yaml | 2 +- .../lambada_multilingual/lambada_mt_en.yaml | 2 +- .../leaderboard/bbh_mc/_fewshot_template_yaml | 1 - .../leaderboard/bbh_mc/_leaderboard_bbh.yaml | 26 ++ .../leaderboard/gpqa/_leaderboard_gpqa.yaml | 5 + lm_eval/tasks/leaderboard/gpqa/_template_yaml | 1 - .../_leaderboard_instruction_following.yaml | 3 + lm_eval/tasks/leaderboard/ifeval/ifeval.yaml | 1 - .../leaderboard/math/_leaderboard_math.yaml | 9 + lm_eval/tasks/leaderboard/math/_template_yaml | 2 - lm_eval/tasks/leaderboard/musr/_musr.yaml | 5 + lm_eval/tasks/leaderboard/musr/_template_yaml | 2 - lm_eval/tasks/mathqa/mathqa.yaml | 2 +- lm_eval/tasks/mmlu/_generate_configs.py | 3 +- lm_eval/tasks/mmlu/continuation/_mmlu.yaml | 34 +- .../continuation/mmlu_abstract_algebra.yaml | 2 +- .../tasks/mmlu/continuation/mmlu_anatomy.yaml | 2 +- .../mmlu/continuation/mmlu_astronomy.yaml | 2 +- .../continuation/mmlu_business_ethics.yaml | 2 +- .../continuation/mmlu_clinical_knowledge.yaml | 2 +- .../continuation/mmlu_college_biology.yaml | 2 +- .../continuation/mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../continuation/mmlu_college_medicine.yaml | 2 +- 
.../continuation/mmlu_college_physics.yaml | 2 +- .../continuation/mmlu_computer_security.yaml | 2 +- .../continuation/mmlu_conceptual_physics.yaml | 2 +- .../mmlu/continuation/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../mmlu/continuation/mmlu_formal_logic.yaml | 2 +- .../mmlu/continuation/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../mmlu/continuation/mmlu_human_aging.yaml | 2 +- .../continuation/mmlu_human_sexuality.yaml | 2 +- .../continuation/mmlu_international_law.yaml | 2 +- .../mmlu/continuation/mmlu_jurisprudence.yaml | 2 +- .../continuation/mmlu_logical_fallacies.yaml | 2 +- .../continuation/mmlu_machine_learning.yaml | 2 +- .../mmlu/continuation/mmlu_management.yaml | 2 +- .../mmlu/continuation/mmlu_marketing.yaml | 2 +- .../continuation/mmlu_medical_genetics.yaml | 2 +- .../mmlu/continuation/mmlu_miscellaneous.yaml | 2 +- .../continuation/mmlu_moral_disputes.yaml | 2 +- .../continuation/mmlu_moral_scenarios.yaml | 2 +- .../mmlu/continuation/mmlu_nutrition.yaml | 2 +- .../mmlu/continuation/mmlu_philosophy.yaml | 2 +- .../mmlu/continuation/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../continuation/mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../continuation/mmlu_public_relations.yaml | 2 +- .../continuation/mmlu_security_studies.yaml | 2 +- .../mmlu/continuation/mmlu_sociology.yaml | 2 +- .../continuation/mmlu_us_foreign_policy.yaml | 2 +- .../mmlu/continuation/mmlu_virology.yaml | 2 +- .../continuation/mmlu_world_religions.yaml | 2 +- lm_eval/tasks/mmlu/default/_mmlu.yaml | 5 + .../tasks/mmlu/default/_mmlu_humanities.yaml | 9 + lm_eval/tasks/mmlu/default/_mmlu_other.yaml | 9 + .../mmlu/default/_mmlu_social_sciences.yaml | 9 + lm_eval/tasks/mmlu/default/_mmlu_stem.yaml | 9 + .../mmlu/default/mmlu_abstract_algebra.yaml | 3 +- lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml | 3 +- .../tasks/mmlu/default/mmlu_astronomy.yaml | 3 +- .../mmlu/default/mmlu_business_ethics.yaml | 3 +- .../mmlu/default/mmlu_clinical_knowledge.yaml | 3 +- .../mmlu/default/mmlu_college_biology.yaml | 3 +- .../mmlu/default/mmlu_college_chemistry.yaml | 3 +- .../mmlu_college_computer_science.yaml | 3 +- .../default/mmlu_college_mathematics.yaml | 3 +- .../mmlu/default/mmlu_college_medicine.yaml | 3 +- .../mmlu/default/mmlu_college_physics.yaml | 3 +- .../mmlu/default/mmlu_computer_security.yaml | 3 +- .../mmlu/default/mmlu_conceptual_physics.yaml | 3 +- .../tasks/mmlu/default/mmlu_econometrics.yaml | 3 +- .../default/mmlu_electrical_engineering.yaml | 3 +- .../default/mmlu_elementary_mathematics.yaml | 3 +- .../tasks/mmlu/default/mmlu_formal_logic.yaml | 3 +- .../tasks/mmlu/default/mmlu_global_facts.yaml | 3 +- .../default/mmlu_high_school_biology.yaml | 3 +- .../default/mmlu_high_school_chemistry.yaml | 3 +- 
.../mmlu_high_school_computer_science.yaml | 3 +- .../mmlu_high_school_european_history.yaml | 3 +- .../default/mmlu_high_school_geography.yaml | 3 +- ...u_high_school_government_and_politics.yaml | 3 +- .../mmlu_high_school_macroeconomics.yaml | 3 +- .../default/mmlu_high_school_mathematics.yaml | 3 +- .../mmlu_high_school_microeconomics.yaml | 3 +- .../default/mmlu_high_school_physics.yaml | 3 +- .../default/mmlu_high_school_psychology.yaml | 3 +- .../default/mmlu_high_school_statistics.yaml | 3 +- .../default/mmlu_high_school_us_history.yaml | 3 +- .../mmlu_high_school_world_history.yaml | 3 +- .../tasks/mmlu/default/mmlu_human_aging.yaml | 3 +- .../mmlu/default/mmlu_human_sexuality.yaml | 3 +- .../mmlu/default/mmlu_international_law.yaml | 3 +- .../mmlu/default/mmlu_jurisprudence.yaml | 3 +- .../mmlu/default/mmlu_logical_fallacies.yaml | 3 +- .../mmlu/default/mmlu_machine_learning.yaml | 3 +- .../tasks/mmlu/default/mmlu_management.yaml | 3 +- .../tasks/mmlu/default/mmlu_marketing.yaml | 3 +- .../mmlu/default/mmlu_medical_genetics.yaml | 3 +- .../mmlu/default/mmlu_miscellaneous.yaml | 3 +- .../mmlu/default/mmlu_moral_disputes.yaml | 3 +- .../mmlu/default/mmlu_moral_scenarios.yaml | 3 +- .../tasks/mmlu/default/mmlu_nutrition.yaml | 3 +- .../tasks/mmlu/default/mmlu_philosophy.yaml | 3 +- .../tasks/mmlu/default/mmlu_prehistory.yaml | 3 +- .../default/mmlu_professional_accounting.yaml | 3 +- .../mmlu/default/mmlu_professional_law.yaml | 3 +- .../default/mmlu_professional_medicine.yaml | 3 +- .../default/mmlu_professional_psychology.yaml | 3 +- .../mmlu/default/mmlu_public_relations.yaml | 3 +- .../mmlu/default/mmlu_security_studies.yaml | 3 +- .../tasks/mmlu/default/mmlu_sociology.yaml | 3 +- .../mmlu/default/mmlu_us_foreign_policy.yaml | 3 +- lm_eval/tasks/mmlu/default/mmlu_virology.yaml | 3 +- .../mmlu/default/mmlu_world_religions.yaml | 3 +- .../tasks/mmlu/flan_cot_fewshot/_mmlu.yaml | 34 +- .../mmlu_abstract_algebra.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_anatomy.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_astronomy.yaml | 2 +- .../mmlu_business_ethics.yaml | 2 +- .../mmlu_clinical_knowledge.yaml | 2 +- .../mmlu_college_biology.yaml | 2 +- .../mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../mmlu_college_medicine.yaml | 2 +- .../mmlu_college_physics.yaml | 2 +- .../mmlu_computer_security.yaml | 2 +- .../mmlu_conceptual_physics.yaml | 2 +- .../flan_cot_fewshot/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../flan_cot_fewshot/mmlu_formal_logic.yaml | 2 +- .../flan_cot_fewshot/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../flan_cot_fewshot/mmlu_human_aging.yaml | 2 +- .../mmlu_human_sexuality.yaml | 2 +- .../mmlu_international_law.yaml | 2 +- .../flan_cot_fewshot/mmlu_jurisprudence.yaml | 2 +- .../mmlu_logical_fallacies.yaml 
| 2 +- .../mmlu_machine_learning.yaml | 2 +- .../flan_cot_fewshot/mmlu_management.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_marketing.yaml | 2 +- .../mmlu_medical_genetics.yaml | 2 +- .../flan_cot_fewshot/mmlu_miscellaneous.yaml | 2 +- .../flan_cot_fewshot/mmlu_moral_disputes.yaml | 2 +- .../mmlu_moral_scenarios.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_nutrition.yaml | 2 +- .../flan_cot_fewshot/mmlu_philosophy.yaml | 2 +- .../flan_cot_fewshot/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../mmlu_public_relations.yaml | 2 +- .../mmlu_security_studies.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_sociology.yaml | 2 +- .../mmlu_us_foreign_policy.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_virology.yaml | 2 +- .../mmlu_world_religions.yaml | 2 +- .../tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml | 34 +- .../mmlu_abstract_algebra.yaml | 2 +- .../mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml | 2 +- .../flan_cot_zeroshot/mmlu_astronomy.yaml | 2 +- .../mmlu_business_ethics.yaml | 2 +- .../mmlu_clinical_knowledge.yaml | 2 +- .../mmlu_college_biology.yaml | 2 +- .../mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../mmlu_college_medicine.yaml | 2 +- .../mmlu_college_physics.yaml | 2 +- .../mmlu_computer_security.yaml | 2 +- .../mmlu_conceptual_physics.yaml | 2 +- .../flan_cot_zeroshot/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../flan_cot_zeroshot/mmlu_formal_logic.yaml | 2 +- .../flan_cot_zeroshot/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../flan_cot_zeroshot/mmlu_human_aging.yaml | 2 +- .../mmlu_human_sexuality.yaml | 2 +- .../mmlu_international_law.yaml | 2 +- .../flan_cot_zeroshot/mmlu_jurisprudence.yaml | 2 +- .../mmlu_logical_fallacies.yaml | 2 +- .../mmlu_machine_learning.yaml | 2 +- .../flan_cot_zeroshot/mmlu_management.yaml | 2 +- .../flan_cot_zeroshot/mmlu_marketing.yaml | 2 +- .../mmlu_medical_genetics.yaml | 2 +- .../flan_cot_zeroshot/mmlu_miscellaneous.yaml | 2 +- .../mmlu_moral_disputes.yaml | 2 +- .../mmlu_moral_scenarios.yaml | 2 +- .../flan_cot_zeroshot/mmlu_nutrition.yaml | 2 +- .../flan_cot_zeroshot/mmlu_philosophy.yaml | 2 +- .../flan_cot_zeroshot/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../mmlu_public_relations.yaml | 2 +- .../mmlu_security_studies.yaml | 2 +- .../flan_cot_zeroshot/mmlu_sociology.yaml | 2 +- .../mmlu_us_foreign_policy.yaml | 2 +- .../mmlu/flan_cot_zeroshot/mmlu_virology.yaml | 2 +- .../mmlu_world_religions.yaml | 2 +- .../mmlu/flan_n_shot/generative/_mmlu.yaml | 34 +- 
.../_mmlu_flan_generative_template_yaml | 1 - .../generative/mmlu_abstract_algebra.yaml | 2 +- .../flan_n_shot/generative/mmlu_anatomy.yaml | 2 +- .../generative/mmlu_astronomy.yaml | 2 +- .../generative/mmlu_business_ethics.yaml | 2 +- .../generative/mmlu_clinical_knowledge.yaml | 2 +- .../generative/mmlu_college_biology.yaml | 2 +- .../generative/mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../generative/mmlu_college_mathematics.yaml | 2 +- .../generative/mmlu_college_medicine.yaml | 2 +- .../generative/mmlu_college_physics.yaml | 2 +- .../generative/mmlu_computer_security.yaml | 2 +- .../generative/mmlu_conceptual_physics.yaml | 2 +- .../generative/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../generative/mmlu_formal_logic.yaml | 2 +- .../generative/mmlu_global_facts.yaml | 2 +- .../generative/mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../generative/mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../generative/mmlu_human_aging.yaml | 2 +- .../generative/mmlu_human_sexuality.yaml | 2 +- .../generative/mmlu_international_law.yaml | 2 +- .../generative/mmlu_jurisprudence.yaml | 2 +- .../generative/mmlu_logical_fallacies.yaml | 2 +- .../generative/mmlu_machine_learning.yaml | 2 +- .../generative/mmlu_management.yaml | 2 +- .../generative/mmlu_marketing.yaml | 2 +- .../generative/mmlu_medical_genetics.yaml | 2 +- .../generative/mmlu_miscellaneous.yaml | 2 +- .../generative/mmlu_moral_disputes.yaml | 2 +- .../generative/mmlu_moral_scenarios.yaml | 2 +- .../generative/mmlu_nutrition.yaml | 2 +- .../generative/mmlu_philosophy.yaml | 2 +- .../generative/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../generative/mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../generative/mmlu_public_relations.yaml | 2 +- .../generative/mmlu_security_studies.yaml | 2 +- .../generative/mmlu_sociology.yaml | 2 +- .../generative/mmlu_us_foreign_policy.yaml | 2 +- .../flan_n_shot/generative/mmlu_virology.yaml | 2 +- .../generative/mmlu_world_religions.yaml | 2 +- .../mmlu/flan_n_shot/loglikelihood/_mmlu.yaml | 34 +- .../loglikelihood/mmlu_abstract_algebra.yaml | 2 +- .../loglikelihood/mmlu_anatomy.yaml | 2 +- .../loglikelihood/mmlu_astronomy.yaml | 2 +- .../loglikelihood/mmlu_business_ethics.yaml | 2 +- .../mmlu_clinical_knowledge.yaml | 2 +- .../loglikelihood/mmlu_college_biology.yaml | 2 +- .../loglikelihood/mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../loglikelihood/mmlu_college_medicine.yaml | 2 +- .../loglikelihood/mmlu_college_physics.yaml | 2 +- .../loglikelihood/mmlu_computer_security.yaml | 2 +- .../mmlu_conceptual_physics.yaml | 2 +- .../loglikelihood/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- 
.../loglikelihood/mmlu_formal_logic.yaml | 2 +- .../loglikelihood/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../loglikelihood/mmlu_human_aging.yaml | 2 +- .../loglikelihood/mmlu_human_sexuality.yaml | 2 +- .../loglikelihood/mmlu_international_law.yaml | 2 +- .../loglikelihood/mmlu_jurisprudence.yaml | 2 +- .../loglikelihood/mmlu_logical_fallacies.yaml | 2 +- .../loglikelihood/mmlu_machine_learning.yaml | 2 +- .../loglikelihood/mmlu_management.yaml | 2 +- .../loglikelihood/mmlu_marketing.yaml | 2 +- .../loglikelihood/mmlu_medical_genetics.yaml | 2 +- .../loglikelihood/mmlu_miscellaneous.yaml | 2 +- .../loglikelihood/mmlu_moral_disputes.yaml | 2 +- .../loglikelihood/mmlu_moral_scenarios.yaml | 2 +- .../loglikelihood/mmlu_nutrition.yaml | 2 +- .../loglikelihood/mmlu_philosophy.yaml | 2 +- .../loglikelihood/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../loglikelihood/mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../loglikelihood/mmlu_public_relations.yaml | 2 +- .../loglikelihood/mmlu_security_studies.yaml | 2 +- .../loglikelihood/mmlu_sociology.yaml | 2 +- .../loglikelihood/mmlu_us_foreign_policy.yaml | 2 +- .../loglikelihood/mmlu_virology.yaml | 2 +- .../loglikelihood/mmlu_world_religions.yaml | 2 +- lm_eval/tasks/mmlu/generative/_mmlu.yaml | 34 +- .../generative/mmlu_abstract_algebra.yaml | 3 +- .../tasks/mmlu/generative/mmlu_anatomy.yaml | 3 +- .../tasks/mmlu/generative/mmlu_astronomy.yaml | 3 +- .../mmlu/generative/mmlu_business_ethics.yaml | 3 +- .../generative/mmlu_clinical_knowledge.yaml | 3 +- .../mmlu/generative/mmlu_college_biology.yaml | 3 +- .../generative/mmlu_college_chemistry.yaml | 3 +- .../mmlu_college_computer_science.yaml | 3 +- .../generative/mmlu_college_mathematics.yaml | 3 +- .../generative/mmlu_college_medicine.yaml | 3 +- .../mmlu/generative/mmlu_college_physics.yaml | 3 +- .../generative/mmlu_computer_security.yaml | 3 +- .../generative/mmlu_conceptual_physics.yaml | 3 +- .../mmlu/generative/mmlu_econometrics.yaml | 3 +- .../mmlu_electrical_engineering.yaml | 3 +- .../mmlu_elementary_mathematics.yaml | 3 +- .../mmlu/generative/mmlu_formal_logic.yaml | 3 +- .../mmlu/generative/mmlu_global_facts.yaml | 3 +- .../generative/mmlu_high_school_biology.yaml | 3 +- .../mmlu_high_school_chemistry.yaml | 3 +- .../mmlu_high_school_computer_science.yaml | 3 +- .../mmlu_high_school_european_history.yaml | 3 +- .../mmlu_high_school_geography.yaml | 3 +- ...u_high_school_government_and_politics.yaml | 3 +- .../mmlu_high_school_macroeconomics.yaml | 3 +- .../mmlu_high_school_mathematics.yaml | 3 +- .../mmlu_high_school_microeconomics.yaml | 3 +- .../generative/mmlu_high_school_physics.yaml | 3 +- .../mmlu_high_school_psychology.yaml | 3 +- .../mmlu_high_school_statistics.yaml | 3 +- .../mmlu_high_school_us_history.yaml | 3 +- 
.../mmlu_high_school_world_history.yaml | 3 +- .../mmlu/generative/mmlu_human_aging.yaml | 3 +- .../mmlu/generative/mmlu_human_sexuality.yaml | 3 +- .../generative/mmlu_international_law.yaml | 3 +- .../mmlu/generative/mmlu_jurisprudence.yaml | 3 +- .../generative/mmlu_logical_fallacies.yaml | 3 +- .../generative/mmlu_machine_learning.yaml | 3 +- .../mmlu/generative/mmlu_management.yaml | 3 +- .../tasks/mmlu/generative/mmlu_marketing.yaml | 3 +- .../generative/mmlu_medical_genetics.yaml | 3 +- .../mmlu/generative/mmlu_miscellaneous.yaml | 3 +- .../mmlu/generative/mmlu_moral_disputes.yaml | 3 +- .../mmlu/generative/mmlu_moral_scenarios.yaml | 3 +- .../tasks/mmlu/generative/mmlu_nutrition.yaml | 3 +- .../mmlu/generative/mmlu_philosophy.yaml | 3 +- .../mmlu/generative/mmlu_prehistory.yaml | 3 +- .../mmlu_professional_accounting.yaml | 3 +- .../generative/mmlu_professional_law.yaml | 3 +- .../mmlu_professional_medicine.yaml | 3 +- .../mmlu_professional_psychology.yaml | 3 +- .../generative/mmlu_public_relations.yaml | 3 +- .../generative/mmlu_security_studies.yaml | 3 +- .../tasks/mmlu/generative/mmlu_sociology.yaml | 3 +- .../generative/mmlu_us_foreign_policy.yaml | 3 +- .../tasks/mmlu/generative/mmlu_virology.yaml | 3 +- .../mmlu/generative/mmlu_world_religions.yaml | 3 +- .../advanced_ai_risk/_template_yaml | 2 +- .../persona/_template_yaml | 2 +- .../sycophancy/sycophancy_on_nlp_survey.yaml | 2 +- .../sycophancy_on_philpapers2020.yaml | 2 +- ...sycophancy_on_political_typology_quiz.yaml | 2 +- .../{_template_yaml => winogenerated.yaml} | 2 +- .../tasks/okapi/arc_multilingual/_arc_yaml | 2 +- .../hellaswag_multilingual/_hellaswag_yaml | 2 +- .../okapi/mmlu_multilingual/_default_yaml | 2 +- .../_truthfulqa_mc1_yaml | 2 +- lm_eval/tasks/paloma/_paloma_template | 2 +- lm_eval/tasks/paws-x/_pawsx.yaml | 15 + lm_eval/tasks/paws-x/pawsx_template_yaml | 1 - lm_eval/tasks/pile/pile_arxiv.yaml | 2 - lm_eval/tasks/polemo2/polemo2_in.yaml | 2 +- lm_eval/tasks/qa4mre/qa4mre_2011.yaml | 2 +- lm_eval/tasks/qasper/bool.yaml | 2 +- lm_eval/tasks/qasper/freeform.yaml | 2 +- lm_eval/tasks/squad_completion/task.py | 2 +- lm_eval/tasks/storycloze/storycloze_2016.yaml | 2 +- lm_eval/tasks/storycloze/storycloze_2018.yaml | 2 +- lm_eval/tasks/super_glue/README.md | 6 +- lm_eval/tasks/super_glue/boolq/default.yaml | 2 +- lm_eval/tasks/super_glue/boolq/seq2seq.yaml | 2 +- lm_eval/tasks/super_glue/boolq/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/cb/default.yaml | 2 +- lm_eval/tasks/super_glue/cb/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/copa/default.yaml | 2 +- lm_eval/tasks/super_glue/copa/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/multirc/default.yaml | 2 +- .../tasks/super_glue/multirc/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/record/default.yaml | 2 +- .../tasks/super_glue/record/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/rte/default.yaml | 2 +- lm_eval/tasks/super_glue/rte/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wic/default.yaml | 2 +- lm_eval/tasks/super_glue/wic/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wsc/default.yaml | 2 +- lm_eval/tasks/super_glue/wsc/t5-prompt.yaml | 2 +- lm_eval/tasks/swde/task.py | 2 +- .../tasks/translation/iwslt2017_ar-en.yaml | 3 +- .../tasks/translation/iwslt2017_en-ar.yaml | 3 +- lm_eval/tasks/translation/wmt14_en-fr.yaml | 3 +- lm_eval/tasks/translation/wmt14_fr-en.yaml | 3 +- lm_eval/tasks/translation/wmt16_de-en.yaml | 3 +- lm_eval/tasks/translation/wmt16_en-de.yaml | 3 +- lm_eval/tasks/translation/wmt16_en-ro.yaml | 3 +- 
lm_eval/tasks/translation/wmt16_ro-en.yaml | 3 +- lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml | 2 +- lm_eval/tasks/unscramble/anagrams1.yaml | 2 +- lm_eval/tasks/unscramble/anagrams2.yaml | 2 +- lm_eval/tasks/unscramble/cycle_letters.yaml | 2 +- .../tasks/unscramble/random_insertion.yaml | 2 +- lm_eval/tasks/unscramble/reversed_words.yaml | 2 +- lm_eval/tasks/webqs/webqs.yaml | 2 +- lm_eval/tasks/wmdp/README.md | 2 +- lm_eval/tasks/wmdp/_default_template_yaml | 1 - lm_eval/tasks/wmdp/_wmdp.yaml | 11 + lm_eval/tasks/wmt2016/README.md | 6 +- lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml | 2 - lm_eval/tasks/xcopa/_xcopa.yaml | 19 + lm_eval/tasks/xcopa/default_et.yaml | 1 - lm_eval/tasks/xnli/_xnli.yaml | 23 ++ lm_eval/tasks/xnli/xnli_common_yaml | 1 - lm_eval/tasks/xnli_eu/README.md | 4 +- lm_eval/tasks/xnli_eu/xnli_common_yaml | 1 - lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml | 2 +- lm_eval/tasks/xnli_eu/xnli_eu_native.yaml | 2 +- lm_eval/tasks/xstorycloze/_xstorycloze.yaml | 19 + lm_eval/tasks/xstorycloze/default_ar.yaml | 1 - lm_eval/tasks/xwinograd/_xwinograd.yaml | 14 + lm_eval/tasks/xwinograd/xwinograd_common_yaml | 2 - lm_eval/utils.py | 16 +- templates/new_yaml_task/README.md | 6 +- tests/test_evaluator.py | 2 +- ...AI-pythia-14m-dtype-float32-device-cpu.txt | 14 +- ...AI-pythia-14m-dtype-float32-device-cpu.txt | 8 +- ...AI-pythia-14m-dtype-float32-device-cpu.txt | 5 - ...AI-pythia-14m-dtype-float32-device-cpu.txt | 22 ++ ...AI-pythia-14m-dtype-float32-device-cpu.txt | 6 +- tests/testyamls/test-01.yaml | 45 +++ 774 files changed, 3114 insertions(+), 1447 deletions(-) create mode 100644 lm_eval/api/group.py create mode 100644 lm_eval/tasks/aclue/_aclue.yaml create mode 100644 lm_eval/tasks/aexams/_aexams.yaml create mode 100644 lm_eval/tasks/agieval/agieval.yaml create mode 100644 lm_eval/tasks/agieval/agieval_cn.yaml create mode 100644 lm_eval/tasks/agieval/agieval_en.yaml create mode 100644 lm_eval/tasks/agieval/agieval_nous.yaml rename lm_eval/tasks/arabicmmlu/{arabicmmlu.yaml => _arabicmmlu.yaml} (60%) create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_humanities.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_language.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_other.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_social_science.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_stem.yaml rename lm_eval/tasks/arabicmmlu/{_default_template_yaml => _default_arabicmmlu_template_yaml} (92%) create mode 100644 lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml create mode 100644 lm_eval/tasks/bbh/cot_fewshot/_bbh_cot_fewshot.yaml create mode 100644 lm_eval/tasks/bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml create mode 100644 lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml create mode 100644 lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml create mode 100644 lm_eval/tasks/belebele/_belebele.yaml create mode 100644 lm_eval/tasks/blimp/_blimp.yaml create mode 100644 lm_eval/tasks/ceval/_ceval-valid.yaml create mode 100644 lm_eval/tasks/cmmlu/_cmmlu.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_ancient_chinese.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_arts.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_astronomy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_business_ethics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_civil_service_exam.yaml create mode 100644 
lm_eval/tasks/cmmlu/cmmlu_chinese_driving_rule.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_food_culture.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_foreign_policy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_history.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_literature.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_teacher_qualification.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_education.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_law.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_medicine.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_conceptual_physics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_construction_project_management.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_economics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_education.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_electrical_engineering.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_information_and_technology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_food_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_genetics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_global_facts.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_biology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_geography.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_physics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_human_sexuality.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_international_law.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_journalism.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_jurisprudence.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_legal_and_moral_basis.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_logical.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_management.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_marketing.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_modern_chinese.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_law.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_medicine.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_psychology.yaml 
create mode 100644 lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_security_study.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_sociology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_sports_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_traditional_chinese_medicine.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_virology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_world_history.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_world_religions.yaml create mode 100644 lm_eval/tasks/csatqa/_csatqa.yaml create mode 100644 lm_eval/tasks/haerae/_haerae.yaml create mode 100644 lm_eval/tasks/inverse_scaling/_some_results create mode 100644 lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml create mode 100644 lm_eval/tasks/leaderboard/ifeval/_leaderboard_instruction_following.yaml create mode 100644 lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/_musr.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_other.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_stem.yaml rename lm_eval/tasks/model_written_evals/winogenerated/{_template_yaml => winogenerated.yaml} (93%) create mode 100644 lm_eval/tasks/paws-x/_pawsx.yaml create mode 100644 lm_eval/tasks/wmdp/_wmdp.yaml create mode 100644 lm_eval/tasks/xcopa/_xcopa.yaml create mode 100644 lm_eval/tasks/xnli/_xnli.yaml create mode 100644 lm_eval/tasks/xstorycloze/_xstorycloze.yaml create mode 100644 lm_eval/tasks/xwinograd/_xwinograd.yaml delete mode 100644 tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt create mode 100644 tests/testdata/mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt create mode 100644 tests/testyamls/test-01.yaml diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 2f6d3217..23f8dd81 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -285,7 +285,7 @@ As a heuristic check: For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md) . If none of the above sound like they apply to your task, it's time to continue onto checking your task performance! -### Task name + groups (registering a task) +### Task name + tags (registering a task) To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists! @@ -296,14 +296,14 @@ task: ``` Including a task name is mandatory. -It is often also convenient to label your task with several `groups`, or tags, though this field is optional: +It is often also convenient to label your task with several `tag` values, though this field is optional: ```yaml -group: - - group1 - - group2 +tag: + - tag1 + - tag2 ``` -This will add your task to the `group1` and `group2` groups, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them. 
+This will add your task to the `tag1` and `tag2` tags, enabling people to categorize your task and, if desired, to run all tasks with one of these tags at once, your task along with them. If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files. @@ -319,7 +319,48 @@ Passing `--tasks /path/to/yaml/file` is also accepted. ### Advanced Group Configs -You can make more complete group config while also tailoring parameters for individual tasks. +While `tag` values are helpful when you want to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, we often wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'. + +Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for the number of shots when evaluating it as a standalone task. + +We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly to tasks' YAML configs, with a few differences. + +The most basic form of group can be defined via a YAML config similar to the following: + +```yaml +group: nli_tasks +task: + - cb + - anli_r1 + - rte +metadata: + version: 1.0 +``` + +This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader. + + +Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following: + +```yaml +group: nli_tasks +task: + - cb + - anli_r1 + - rte +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). +metadata: + version: 1.0 +``` + +Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys. + +**[!Tip]: currently, we only support the aggregation of group metrics that use `mean` (either micro- or macro- averaged) over their subtasks. If you require more complex aggregation rules, you may want to perform aggregation offline.** + +Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our new group. For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can write the name of the benchmark as a subtask under `task`. You can configure parameters such as `num_fewshot`.
If the task being configured is a group such as `mmlu` or `super_glue`, the parameter set will be applied to all of the subtasks. @@ -331,33 +372,13 @@ task: - cb - anli_r1 - rte + aggregate_metric_list: + - metric: acc + aggregation: mean + higher_is_better: true - task: mmlu num_fewshot: 2 ``` -It's also important to note how you can basically insert a group config as a task. Here, to make a group of natural language inference tasks, you simply write like how you would normally write a group config but this time place that as part of a task list under the main group being built. - -### Duplicate Tasks in Group Configs - -There might be cases where you might want to evaluate prompts and how models perform over prompt variations. You can list an existing task (In the example below, `anli_r1`) which varying `doc_to_text` implementation. To differentiate from each variation, we can utilize `task_alias`. LM-Eval will recognize that there are multiple variations of the same tasks and differentiate them. -```yaml -group: flan_held_in -group_alias: Flan (Held-In) -task: - # ANLI R1 - - group: anli_r1_flan - group_alias: ANLI R1 - task: - - task: anli_r1 - task_alias: prompt-0 - include: _held_in_template_yaml - doc_to_text: "{{premise}}\n\nChoose your answer ..." - ... - - task: anli_r1 - task_alias: prompt-1 - include: _held_in_template_yaml - doc_to_text: "{{premise}}\n\nBased on ..." - ... -``` ### Configuring python classes @@ -384,19 +405,16 @@ task: ## Beautifying Table Display -To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`. +To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending a suffix that refers to the variation, as in MMLU, where the FLAN-template variants are differentiated from the default by the `mmlu_flan_*` prefix. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make the table more legible, you can use `task_alias` and `group_alias` to provide alternative task and group names that will be printed. For example, in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.
``` "dataset_name": "abstract_algebra" "description": "The following are multiple choice questions (with answers) about abstract\ \ algebra.\n\n" -"group": "mmlu_stem" -"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_abstract_algebra" "task_alias": "abstract_algebra" ``` -Note: Even though `group` can be a list, for now, `group_alias` can only be a single string. ## Checking validity @@ -416,9 +434,9 @@ a simple eye test. ## Versioning -One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. +One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. -This version info can be provided by adding the following to your new task config file: +This version info can be provided by adding the following to your new task or group config file: ``` metadata: diff --git a/docs/task_guide.md b/docs/task_guide.md index b10ca7d9..34e47c41 100644 --- a/docs/task_guide.md +++ b/docs/task_guide.md @@ -16,7 +16,8 @@ Tasks are configured via the `TaskConfig` object. Below, we describe all fields Task naming + registration: - **task** (`str`, defaults to None) — name of the task. -- **group** (`str`, *optional*) — name of the task group(s) a task belongs to. Enables one to run all tasks with a specified tag or group name at once. +- **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results. +- **tag** (`str`, *optional*) — name of the task tags(s) a task belongs to. Enables one to run all tasks with a specified tag name at once. Dataset configuration options: - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub. @@ -55,8 +56,6 @@ Other: ## Filters -Explain: What are filters? What is their place in the pipeline? - A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring). After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user. @@ -295,105 +294,24 @@ Generative tasks: Tasks using complex filtering: - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`) - -## Benchmarks +# Group Configuration When evaluating a language model, it's is not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be combursome to have to list the set of tasks or add a new group name to each yaml of each individual task. -To solve this, we can create a benchmark yaml config. This is a config that contains the names of the tasks that should be included in a particular benchmark. The config consists of two main keys `group` which denotes the name of the benchmark and `task` which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example would be the list of tasks used to evaluate the Pythia Suite. 
- -```yaml -group: pythia -task: - - lambada_openai - - wikitext - - piqa - - sciq - - wsc - - winogrande - - arc - - logiqa - - blimp - - hendrycksTest* -``` - -It is also possible to list an existing task in your benchmark configuration with some adjustments. For example, a few tasks from mmlu is included `multimedqa`. There, the `task_alias` and `group_alias` (See [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#beautifying-table-display) for more details) are modified to suit the benchmark. - -```yaml -group: multimedqa -task: - - pubmedqa - - medmcqa - - medqa_4options - - task: mmlu_anatomy - task_alias: "anatomy (mmlu)" - group_alias: null - - task: mmlu_clinical_knowledge - task_alias: "clinical_knowledge (mmlu)" - group_alias: null - ... -``` +To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [../lm_eval/tasks/mmlu/default/_mmlu.yaml]. See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs. -Alternatively, benchmarks can have tasks that are customizable for each task. They can be defined like how a yaml task is usually set. +## Configurations -```yaml -group: t0_eval -task: - # Coreference Resolution - - dataset_path: super_glue - dataset_name: wsc.fixed - use_prompt: promptsource:* - training_split: train - validation_split: validation - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # Coreference Resolution - - dataset_path: winogrande - dataset_name: winogrande_xl - use_prompt: promptsource:* - training_split: train - validation_split: validation - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - ... -``` +Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a task. -If the benchmark contains the same dataset but with different configurations, use `task` to differentiate between them. For example, T0-Eval evaluates on 3 versions of ANLI but the huggingface dataset collects them in one dataset. - -```YAML -group: t0_eval -task: - ... - - task: anli_r1 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r1 - validation_split: dev_r1 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - task: anli_r2 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r2 - validation_split: dev_r2 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true -``` +### Parameters -Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/` +- **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line. 
+- **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output. +- **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group. +- **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are: + - `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name). + - `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics. **currently, only `mean` is supported.** + - `weight_by_size: bool = True` - whether to perform micro- averaging (`True`) or macro- averaging (`False`) of subtasks' accuracy scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset. + - `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results. For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`. +- **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. Set the `num_fewshot` key within this to override the printed n_shot value in a results table for your group, for example. diff --git a/examples/lm-eval-overview.ipynb b/examples/lm-eval-overview.ipynb index 898192ce..6ba5cdf2 100644 --- a/examples/lm-eval-overview.ipynb +++ b/examples/lm-eval-overview.ipynb @@ -377,7 +377,7 @@ "id": "LOUHK7PtQfq4" }, "source": [ - "Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, We can also group tasks and call that the group name to evaluate on a set of tasks easily. In this instance, let's evaluate the group `yes_or_no_tasks` which comprise of the tasks `demo_boolq` and `demo_cola`; tasks which are multiple choice tasks with options `yes` and `no` as the name suggests.\n", + "Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, we can also group tasks under a tag and call that tag name to evaluate on a set of tasks easily. In this instance, let's evaluate the tag `yes_or_no_tasks`, which comprises the tasks `demo_boolq` and `demo_cola`; as the name suggests, these are multiple choice tasks with options `yes` and `no`.\n", "\n", "
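To tie the `GroupConfig` fields documented in this patch together, here is a minimal sketch of a group config that exercises them. The group name `my_nli_suite`, its alias, and the choice of subtasks are invented for illustration; only the keys themselves follow the parameter list above.

```yaml
# Hypothetical group config sketch (illustrative names, not shipped with lm-eval).
group: my_nli_suite            # invoked from the command line via `--tasks my_nli_suite`
group_alias: NLI Suite         # printed in the results table instead of the raw group name
task:
  - cb
  - anli_r1
  - rte
aggregate_metric_list:
  - metric: acc
    aggregation: mean          # currently only `mean` is supported
    weight_by_size: false      # macro-average: mean of the per-subtask accuracies
    filter_list: "none"        # aggregate results produced by the default filter
metadata:
  version: 1.0
```

If this sketch matches the behavior described above, running `lm_eval --tasks my_nli_suite` should print an `NLI Suite` row whose `acc` is the macro-average over the three subtasks, with each subtask listed under its subheader.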