"test/vscode:/vscode.git/clone" did not exist on "7bfb18877bad8b2ee3270494f043ddf6a81eda94"
Commit f7f298ee authored by lintangsutawika

removed unused files for now

parents c0d5a660 12bc8fce
@@ -4,7 +4,7 @@ The `lm-evaluation-harness` is meant to be an extensible and flexible framework

 These YAML configuration files, along with the current codebase commit hash, are intended to be shareable such that providing the YAML config enables another researcher to precisely replicate the evaluation setup used by another, in the case that the prompt or setup differs from standard `lm-eval` task implementations.

-While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.
+While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups also exist. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.

 If your intended task relies on features beyond what are described in this guide, we'd love to hear about it! Feel free to open an issue describing the scenario on Github, create a PR to the project with a proposed implementation, or ask in the `#lm-thunderdome` channel on the EleutherAI discord.
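For concreteness, a minimal task config of the kind described above can be sketched as follows. This is an illustrative example rather than a file from the repo; the field names mirror the YAML keys used elsewhere in this commit, and `ai2_arc`/`ARC-Easy` simply stand in for whichever Hugging Face dataset is being swapped in.

```
import yaml

# Hypothetical minimal task config; swapping `dataset_path`/`dataset_name`
# for another Hugging Face dataset is often the only change needed.
MINIMAL_TASK_YAML = """
task: my_arc_easy_variant
dataset_path: ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
validation_split: validation
doc_to_text: "Question: {{question}}\\nAnswer:"
doc_to_choice: "{{choices.text}}"
doc_to_target: "{{choices.label.index(answerKey)}}"
metric_list:
  - metric: acc
"""

config = yaml.safe_load(MINIMAL_TASK_YAML)
print(config["task"], config["dataset_path"])
```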
@@ -380,4 +380,4 @@ task:
       ignore_punctuation: true
 ```
-Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/benchmarks/`
+Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/`
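As an illustration of the sentence above, a benchmark group placed under `lm_eval/tasks/benchmarks/` is passed to `--tasks` exactly like a single task. The programmatic equivalent using the `TaskManager` introduced in this commit would look roughly like this; the model string and `pretrained=` argument are placeholders, and building the tasks will download their datasets.

```
from lm_eval import evaluator
from lm_eval.tasks import TaskManager

task_manager = TaskManager("INFO")

# "flan_held_in" is one of the benchmark groups added in this commit;
# the model name and arguments below are placeholders.
tasks = task_manager.load_task_or_group(["flan_held_in"])
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=tasks,
    batch_size=8,
)
```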
@@ -10,8 +10,7 @@ from typing import Union

 import numpy as np

 from lm_eval import evaluator, utils
-from lm_eval.api.registry import ALL_TASKS
-from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table
@@ -156,44 +155,46 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
     eval_logger.info(f"Verbosity set to {args.verbosity}")
     os.environ["TOKENIZERS_PARALLELISM"] = "false"

-    initialize_tasks(args.verbosity)
+    # initialize_tasks(args.verbosity)
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

     if args.limit:
         eval_logger.warning(
             " --limit SHOULD ONLY BE USED FOR TESTING."
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )

-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
-
     if args.tasks is None:
-        task_names = ALL_TASKS
+        eval_logger.error("Need to specify task to evaluate.")
+        sys.exit()
     elif args.tasks == "list":
         eval_logger.info(
-            "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS)))
+            "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks()))
         )
+        sys.exit()
     else:
         if os.path.isdir(args.tasks):
             import glob

-            task_names = []
+            loaded_task_list = []
             yaml_path = os.path.join(args.tasks, "*.yaml")
             for yaml_file in glob.glob(yaml_path):
                 config = utils.load_yaml_config(yaml_file)
-                task_names.append(config)
+                loaded_task_list.append(config)
         else:
-            tasks_list = args.tasks.split(",")
-            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
-            for task in [task for task in tasks_list if task not in task_names]:
+            input_task_list = args.tasks.split(",")
+            loaded_task_list = utils.pattern_match(
+                input_task_list, task_manager.all_tasks()
+            )
+            for task in [
+                task for task in input_task_list if task not in loaded_task_list
+            ]:
                 if os.path.isfile(task):
                     config = utils.load_yaml_config(task)
-                    task_names.append(config)
+                    loaded_task_list.append(config)
             task_missing = [
                 task
-                for task in tasks_list
-                if task not in task_names and "*" not in task
+                for task in input_task_list
+                if task not in loaded_task_list and "*" not in task
             ]  # we don't want errors if a wildcard ("*") task name was used

             if task_missing:
@@ -226,12 +227,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
     elif args.log_samples and not args.output_path:
         assert args.output_path, "Specify --output_path"

-    eval_logger.info(f"Selected Tasks: {task_names}")
+    eval_logger.info(f"Selected Tasks: {loaded_task_list}")
+    eval_logger.info("Loading selected tasks...")
+
+    all_tasks = task_manager.load_task_or_group(loaded_task_list)

     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,
-        tasks=task_names,
+        tasks=all_tasks,
         num_fewshot=args.num_fewshot,
         batch_size=args.batch_size,
         max_batch_size=args.max_batch_size,
...
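The `utils.pattern_match` call above expands shell-style wildcards in `--tasks` against the task index. A rough stand-in for that matching, assuming simple fnmatch semantics (the real helper lives in `lm_eval.utils`):

```
import fnmatch

def pattern_match(patterns, source_list):
    # Expand each pattern (e.g. "mmlu_flan_*") against the known task names.
    matched = set()
    for pattern in patterns:
        for name in source_list:
            if fnmatch.fnmatch(name, pattern):
                matched.add(name)
    return sorted(matched)

print(pattern_match(["mmlu_flan_*"], ["mmlu", "mmlu_flan_cot_fewshot", "boolq"]))
# -> ['mmlu_flan_cot_fewshot']
```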
@@ -111,6 +111,8 @@ class TaskConfig(dict):
             }

     # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?
+    # if self.dataset_kwargs is None:
+    #     self.dataset_kwargs = {"trust_remote_code": True}

     def __getitem__(self, item):
         return getattr(self, item)
@@ -118,7 +120,7 @@ class TaskConfig(dict):
     def __setitem__(self, item, value):
         return setattr(self, item, value)

-    def to_dict(self):
+    def to_dict(self, keep_callable=False):
         """dumps the current config as a dictionary object, as a printable format.
         null fields will not be printed.
         Used for dumping results alongside full task configuration
@@ -134,8 +136,11 @@ class TaskConfig(dict):
             if v is None:
                 cfg_dict.pop(k)
             elif isinstance(v, Callable):
-                # TODO: this should handle Promptsource template objects as a separate case?
-                cfg_dict[k] = str(v)
+                if keep_callable:
+                    cfg_dict[k] = v
+                else:
+                    # TODO: this should handle Promptsource template objects as a separate case?
+                    cfg_dict[k] = str(v)
         return cfg_dict
...
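The `keep_callable` flag added above controls whether callables (for example `!function` handles like the `doc_to_text: !function ...` entries later in this commit) survive a round trip through `to_dict`, instead of being stringified. A simplified stand-alone illustration, not the real `TaskConfig`:

```
from typing import Callable

def to_dict(cfg: dict, keep_callable: bool = False) -> dict:
    out = {}
    for k, v in cfg.items():
        if v is None:
            continue  # null fields are dropped, as in TaskConfig.to_dict
        if isinstance(v, Callable) and not keep_callable:
            out[k] = str(v)  # printable form for dumping results
        else:
            out[k] = v
    return out

cfg = {"task": "demo", "doc_to_text": lambda doc: doc["question"], "group": None}
print(to_dict(cfg))                      # doc_to_text becomes "<function ...>"
print(to_dict(cfg, keep_callable=True))  # doc_to_text stays a callable
```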
@@ -38,6 +38,7 @@ def simple_evaluate(
     write_out: bool = False,
     log_samples: bool = True,
     gen_kwargs: str = None,
+    weight_by_size: bool = False,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -46,8 +47,8 @@ def simple_evaluate(
     :param model_args: Optional[str]
         String arguments for each model class, see LM.create_from_arg_string.
         Ignored if `model` argument is a LM object.
-    :param tasks: list[Union[str, Task]]
-        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
+    :param tasks: list[Task]
+        List of Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
     :param num_fewshot: int
         Number of examples in few-shot context
     :param batch_size: int or str, optional
@@ -119,7 +120,7 @@ def simple_evaluate(
             + ".db",
         )

-    task_dict = lm_eval.tasks.get_task_dict(tasks)
+    task_dict = tasks
     for task_name in task_dict.keys():
         task_obj = task_dict[task_name]
         if type(task_obj) == tuple:
@@ -155,6 +156,7 @@ def simple_evaluate(
         decontamination_ngrams_path=decontamination_ngrams_path,
         write_out=write_out,
         log_samples=log_samples,
+        weight_by_size=weight_by_size,
     )

     if lm.rank == 0:
@@ -192,6 +194,7 @@ def evaluate(
     decontamination_ngrams_path=None,
     write_out: bool = False,
     log_samples: bool = True,
+    weight_by_size: bool = False,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -479,12 +482,14 @@ def evaluate(
                 if "alias" in metrics:
                     metrics.pop("alias")

-                current_size = metrics.pop("samples")
                 # TODO: There should be a way for users
                 # to toggle between weighted and
                 # unweighted averaging
-                # For unweighted averaging, use:
-                # current_size = 1
+                if weight_by_size:
+                    current_size = metrics.pop("samples")
+                else:
+                    metrics.pop("samples")
+                    current_size = 1

                 # TODO: Tasks like brier score for individual
                 # tasks have no stderr since the score is
...
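The new `weight_by_size` flag switches group aggregation between a sample-weighted mean and a plain mean of per-task scores. A small worked example with made-up numbers:

```
# Two subtasks in a group: (accuracy, number of samples)
subtask_scores = [(0.80, 1000), (0.50, 100)]

# weight_by_size=True: weight each task's score by its sample count
weighted = sum(s * n for s, n in subtask_scores) / sum(n for _, n in subtask_scores)

# weight_by_size=False: every task counts equally (current_size = 1)
unweighted = sum(s for s, _ in subtask_scores) / len(subtask_scores)

print(round(weighted, 4), round(unweighted, 4))  # 0.7727 0.65
```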
 import os
+import abc
 import yaml
+import collections
+from functools import partial, lru_cache
 from typing import List, Union, Dict

 from lm_eval import utils
@@ -10,7 +14,6 @@ from lm_eval.api.registry import (
     register_group,
     TASK_REGISTRY,
     GROUP_REGISTRY,
-    ALL_TASKS,
 )

 import logging
@@ -29,272 +32,203 @@ from .scrolls.task import (

 eval_logger = utils.eval_logger
-def register_configurable_task(config: Dict[str, str]) -> int:
-    SubClass = type(
-        config["task"] + "ConfigurableTask",
-        (ConfigurableTask,),
-        {"CONFIG": TaskConfig(**config)},
-    )
-
-    if "task" in config:
-        task_name = "{}".format(config["task"])
-        register_task(task_name)(SubClass)
-
-    if "group" in config:
-        if config["group"] == config["task"]:
-            raise ValueError("task and group name cannot be the same")
-        elif type(config["group"]) == str:
-            group_name = [config["group"]]
-        else:
-            group_name = config["group"]
-
-        for group in group_name:
-            register_group(group)(SubClass)
-
-    return 0
-
-def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int:
-    group = config["group"]
-    all_task_list = config["task"]
-    config_list = [task for task in all_task_list if type(task) != str]
-    task_list = [task for task in all_task_list if type(task) == str]
-
-    for task_config in config_list:
-        base_config = {}
-        task_name_config = {}
-        if "task" in task_config:
-            task_name = task_config["task"]
-            if task_name in ALL_TASKS:
-                task_obj = get_task_dict(task_name)[task_name]
-                if type(task_obj) == tuple:
-                    _, task_obj = task_obj
-                if task_obj is not None:
-                    base_config = task_obj._config.to_dict()
-                    task_name_config["task"] = f"{group}_{task_name}"
-
-        task_config = utils.load_yaml_config(yaml_path, task_config)
-        var_configs = check_prompt_config(
-            {
-                **base_config,
-                **task_config,
-                **{"group": group},
-                **task_name_config,
-            },
-            yaml_path=os.path.dirname(yaml_path),
-        )
-        for config in var_configs:
-            register_configurable_task(config)
-
-    task_names = utils.pattern_match(task_list, ALL_TASKS)
-    for task in task_names:
-        if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
-            if group in GROUP_REGISTRY:
-                GROUP_REGISTRY[group].append(task)
-            else:
-                GROUP_REGISTRY[group] = [task]
-                ALL_TASKS.add(group)
-
-    return 0
-
-def check_prompt_config(
-    config: Dict[str, str], yaml_path: str = None
-) -> List[Dict[str, str]]:
-    all_configs = []
-    if "use_prompt" in config:
-        prompt_list = prompts.load_prompt_list(
-            use_prompt=config["use_prompt"],
-            dataset_name=config["dataset_path"],
-            subset_name=config["dataset_name"] if "dataset_name" in config else None,
-            yaml_path=yaml_path,
-        )
-        for idx, prompt_variation in enumerate(prompt_list):
-            all_configs.append(
-                {
-                    **config,
-                    **{"use_prompt": prompt_variation},
-                    **{
-                        "task": "_".join(
-                            [
-                                config["task"]
-                                if "task" in config
-                                else get_task_name_from_config(config),
-                                prompt_variation.split("/")[-1]
-                                if ".yaml" in prompt_variation
-                                else prompt_variation,
-                            ]
-                        )
-                    },
-                    **{"output_type": "generate_until"},
-                }
-            )
-    else:
-        all_configs.append(config)
-    return all_configs
-
-def get_task_name_from_config(task_config: Dict[str, str]) -> str:
-    if "dataset_name" in task_config:
-        return "{dataset_path}_{dataset_name}".format(**task_config)
-    else:
-        return "{dataset_path}".format(**task_config)
-
-def include_task_folder(task_dir: str, register_task: bool = True) -> None:
-    """
-    Calling this function
-    """
-    # Track whether any tasks failed during loading
-    import_fail = False
-    for root, subdirs, file_list in os.walk(task_dir):
-        # if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
-        for f in file_list:
-            if f.endswith(".yaml"):
-                yaml_path = os.path.join(root, f)
-                try:
-                    config = utils.load_yaml_config(yaml_path)
-
-                    if "task" not in config:
-                        continue
-
-                    all_configs = check_prompt_config(
-                        config, yaml_path=os.path.dirname(yaml_path)
-                    )
-                    for config in all_configs:
-                        if register_task:
-                            if type(config["task"]) == str:
-                                register_configurable_task(config)
-                        else:
-                            if type(config["task"]) == list:
-                                register_configurable_group(config, yaml_path)
-
-                # Log this silently and show it only when
-                # the user defines the appropriate verbosity.
-                except (ImportError, ModuleNotFoundError) as e:
-                    import_fail = True
-                    eval_logger.debug(
-                        f"{yaml_path}: {e}. Config will not be added to registry."
-                    )
-                except Exception as error:
-                    import traceback
-
-                    eval_logger.warning(
-                        "Unexpected error loading config in\n"
-                        f" {yaml_path}\n"
-                        " Config will not be added to registry\n"
-                        f" Error: {error}\n"
-                        f" Traceback: {traceback.format_exc()}"
-                    )
-    if import_fail:
-        eval_logger.warning(
-            "Some tasks could not be loaded due to missing dependencies."
-            " Run with `--verbosity DEBUG` for full details."
-        )
-    return 0
-
-def include_path(task_dir):
-    include_task_folder(task_dir)
-    # Register Benchmarks after all tasks have been added
-    include_task_folder(task_dir, register_task=False)
-    return 0
-
-def initialize_tasks(verbosity="INFO"):
-    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
-    task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
-    include_path(task_dir)
-
-def get_task(task_name, config):
-    try:
-        return TASK_REGISTRY[task_name](config=config)
-    except KeyError:
-        eval_logger.info("Available tasks:")
-        eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
-        raise KeyError(f"Missing task {task_name}")
-
-def get_task_name_from_object(task_object):
-    for name, class_ in TASK_REGISTRY.items():
-        if class_ is task_object:
-            return name
-
-    # TODO: scrap this
-    # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
-    return (
-        task_object.EVAL_HARNESS_NAME
-        if hasattr(task_object, "EVAL_HARNESS_NAME")
-        else type(task_object).__name__
-    )
-
-# TODO: pass num_fewshot and other cmdline overrides in a better way
-def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
-    config = {**kwargs}
-
-    task_name_from_registry_dict = {}
-    task_name_from_config_dict = {}
-    task_name_from_object_dict = {}
-
-    if type(task_name_list) != list:
-        task_name_list = [task_name_list]
-
-    for task_element in task_name_list:
-        if isinstance(task_element, str):
-            if task_element in GROUP_REGISTRY:
-                group_name = task_element
-                for task_name in GROUP_REGISTRY[task_element]:
-                    if task_name not in task_name_from_registry_dict:
-                        task_obj = get_task_dict(task_name)
-                        if task_name in task_obj.keys():
-                            task_dict = {
-                                task_name: (group_name, task_obj[task_name]),
-                            }
-                        else:
-                            task_dict = {
-                                task_name: (group_name, None),
-                                **task_obj,
-                            }
-
-                        task_name_from_registry_dict = {
-                            **task_name_from_registry_dict,
-                            **task_dict,
-                        }
-            else:
-                task_name = task_element
-                if task_name not in task_name_from_registry_dict:
-                    task_name_from_registry_dict = {
-                        **task_name_from_registry_dict,
-                        task_name: get_task(task_name=task_element, config=config),
-                    }
-
-        elif isinstance(task_element, dict):
-            task_element.update(config)
-            task_name_from_config_dict = {
-                **task_name_from_config_dict,
-                get_task_name_from_config(task_element): ConfigurableTask(
-                    config=task_element
-                ),
-            }
-
-        elif isinstance(task_element, Task):
-            task_name_from_object_dict = {
-                **task_name_from_object_dict,
-                get_task_name_from_object(task_element): task_element,
-            }
-
-    assert set(task_name_from_registry_dict.keys()).isdisjoint(
-        set(task_name_from_object_dict.keys())
-    )
-    return {
-        **task_name_from_registry_dict,
-        **task_name_from_config_dict,
-        **task_name_from_object_dict,
-    }
+class TaskManager(abc.ABC):
+
+    def __init__(
+        self,
+        verbosity="INFO",
+        include_path=None
+    ) -> None:
+        self.verbosity = verbosity
+        self.include_path = include_path
+        self.logger = eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+
+        self.ALL_TASKS = self.initialize_tasks(
+            include_path=include_path
+        )
+
+    def initialize_tasks(self, include_path=None):
+        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_path is not None:
+            if isinstance(include_path, str):
+                include_path = [include_path]
+            all_paths.extend(include_path)
+
+        ALL_TASKS = {}
+        for task_dir in all_paths:
+            tasks = self._get_task_and_group(task_dir)
+            ALL_TASKS = {**tasks, **ALL_TASKS}
+
+        return ALL_TASKS
+
+    def all_tasks(self):
+        return sorted(list(self.ALL_TASKS.keys()))
+
+    def _name_is_registered(self, name):
+        if name in self.ALL_TASKS:
+            return True
+        return False
+
+    def _name_is_task(self, name):
+        if self.ALL_TASKS[name]["type"] == "task":
+            return True
+        return False
+
+    def _config_is_task(self, config):
+        if list(config.keys()) == ["group", "task"]:
+            return False
+        return True
+
+    def _get_yaml_path(self, name):
+        assert name in self.ALL_TASKS
+        return self.ALL_TASKS[name]["yaml_path"]
+
+    def _get_config(self, name):
+        assert name in self.ALL_TASKS
+        yaml_path = self._get_yaml_path(name)
+        return utils.load_yaml_config(yaml_path)
+
+    def _get_tasklist(self, name):
+        assert self._name_is_task(name) == False
+        return self.ALL_TASKS[name]["task"]
+
+    @lru_cache(None)
+    def _load_individual_task_or_group(self, name_or_config: Union[str, dict] = None, parent_name: str = None) -> ConfigurableTask:
+        def load_task(config, task, group=None):
+            task_object = ConfigurableTask(config=config)
+            if group is not None:
+                task_object = (group, task_object)
+            return {task: task_object}
+
+        if isinstance(name_or_config, str):
+            if self._name_is_task(name_or_config):
+                task_config = self._get_config(name_or_config)
+                return load_task(task_config, task=name_or_config, group=parent_name)
+            else:
+                group_name = name_or_config
+                subtask_list = self._get_tasklist(name_or_config)
+                if subtask_list == -1:
+                    subtask_list = self._get_config(name_or_config)["task"]
+        elif isinstance(name_or_config, dict):
+            if self._config_is_task(name_or_config):
+                task_name = name_or_config["task"]
+                if self._name_is_registered(task_name):
+                    base_task_config = self._get_config(task_name)
+                    task_config = {
+                        **base_task_config,
+                        **name_or_config,
+                    }
+                else:
+                    task_config = name_or_config
+                return load_task(task_config, task=name_or_config, group=parent_name)
+            else:
+                group_name = name_or_config["group"]
+                subtask_list = name_or_config["task"]
+
+        if self._get_yaml_path(group_name) == -1:
+            all_subtasks = {group_name: (parent_name, None)}
+        else:
+            all_subtasks = {}
+
+        fn = partial(self._load_individual_task_or_group, parent_name=group_name)
+        all_subtasks = {**all_subtasks, **dict(collections.ChainMap(*map(fn, subtask_list)))}
+        return all_subtasks
+
+    def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
+        if isinstance(task_list, str):
+            task_list = [task_list]
+
+        all_loaded_tasks = dict(
+            collections.ChainMap(
+                *map(
+                    self._load_individual_task_or_group,
+                    task_list
+                )
+            )
+        )
+        return all_loaded_tasks
+
+    def _get_task_and_group(self, task_dir: str):
+        tasks_and_groups = collections.defaultdict()
+        for root, _, file_list in os.walk(task_dir):
+            for f in file_list:
+                if f.endswith(".yaml"):
+                    yaml_path = os.path.join(root, f)
+                    config = utils.simple_load_yaml_config(yaml_path)
+                    if list(config.keys()) == ["group", "task"]:
+                        # This is a group config
+                        tasks_and_groups[config["group"]] = {
+                            "type": "group",
+                            "task": -1,  # This signals that
+                            # we don't need to know
+                            # the task list for indexing
+                            # as it can be loaded
+                            # when called.
+                            "yaml_path": yaml_path,
+                        }
+                    else:
+                        # This is a task config
+                        task = config["task"]
+                        tasks_and_groups[task] = {
+                            "type": "task",
+                            "yaml_path": yaml_path,
+                        }
+
+                        if "group" in config:
+                            groups = config["group"]
+                            if isinstance(config["group"], str):
+                                groups = [groups]
+
+                            for group in groups:
+                                if group not in tasks_and_groups:
+                                    tasks_and_groups[group] = {
+                                        "type": "group",
+                                        "task": [task],
+                                        "yaml_path": -1,
+                                    }
+                                else:
+                                    tasks_and_groups[group]["task"].append(task)
+
+        return tasks_and_groups
# def check_prompt_config(
# config: Dict[str, str], yaml_path: str = None
# ) -> List[Dict[str, str]]:
# all_configs = []
# if "use_prompt" in config:
# prompt_list = prompts.load_prompt_list(
# use_prompt=config["use_prompt"],
# dataset_name=config["dataset_path"],
# subset_name=config["dataset_name"] if "dataset_name" in config else None,
# yaml_path=yaml_path,
# )
# for idx, prompt_variation in enumerate(prompt_list):
# all_configs.append(
# {
# **config,
# **{"use_prompt": prompt_variation},
# **{
# "task": "_".join(
# [
# config["task"]
# if "task" in config
# else get_task_name_from_config(config),
# prompt_variation.split("/")[-1]
# if ".yaml" in prompt_variation
# else prompt_variation,
# ]
# )
# },
# **{"output_type": "generate_until"},
# }
# )
# else:
# all_configs.append(config)
# return all_configs
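Taken together, the intended use of `TaskManager` as wired into `__main__.py` above is: build the index once, list or pattern-match names, then materialize `ConfigurableTask` objects on demand. A minimal sketch, assuming `boolq` resolves in your checkout and noting that constructing a task prepares its dataset:

```
from lm_eval.tasks import TaskManager

# Index every YAML under lm_eval/tasks/ plus any --include_path directories.
tm = TaskManager(verbosity="INFO", include_path=None)

print(len(tm.all_tasks()))  # all indexed task and group names

# Plain task names map to ConfigurableTask objects; group members come back
# as {task_name: (group_name, ConfigurableTask)} entries.
task_dict = tm.load_task_or_group(["boolq"])
for name, obj in task_dict.items():
    print(name, type(obj))
```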
group: flan_anli
task:
- include: yaml_templates/held_in_template_yaml
task: r1
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
group: flan_arc
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
group: flan_boolq
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
group: flan_rte
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation
group: flan_cot
task:
- include: yaml_templates/cot_template_yaml
dataset_path: gsm8k
dataset_name: boolq
use_prompt: promptsource:*
validation_split: validation
- include: yaml_templates/cot_template_yaml
dataset_path: EleutherAI/asdiv
use_prompt: promptsource:*
validation_split: validation
group: flan_held_in
task:
- include: yaml_templates/held_in_template_yaml
task: r1
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation
group: flan_held_out
task:
# BBH
- bbh_zeroshot
- bbh_fewshot
- bbh_cot_fewshot
- bbh_cot_zeroshot
# MMLU
- mmlu
- mmlu_flan_n_shot_generative
- mmlu_flan_n_shot_loglikelihood
- mmlu_flan_cot_zeroshot
- mmlu_flan_cot_fewshot
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-1":
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-2":
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-3":
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-4":
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-5":
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-6":
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-7":
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
"template-8":
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-1":
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-2":
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-3":
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-4":
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-5":
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
"template-6":
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-1":
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-2":
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-3":
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-4":
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-5":
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-6":
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-7":
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-8":
# doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
"template-9":
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
# Flan Prompt Templates
prompts:
"template-0":
doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-1":
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-2":
doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-3":
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-4":
doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-5":
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-6":
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-7":
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
"template-8":
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
group: flan-cot
output_type: generate_until
validation_split: validation
doc_to_target: "{{answer}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
generation_kwargs:
until:
- "\n\n"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
version: 1.0
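The `get-answer` filter above applies the regex to the generated text and then keeps the first match. Outside the harness, the extraction step behaves roughly like this:

```
import re

generation = "She has 3 apples and buys 4 more, so she has 7 apples. The answer is 7"
pattern = re.compile(r"The answer is (\-?[0-9\.\,]+)")

matches = pattern.findall(generation)
answer = matches[0] if matches else None  # "take_first"
print(answer)  # -> 7
```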
group: grouptest
task:
- boolq
- group: arc_stuff
task:
- arc_challenge
- glue
- task: arc_easy
metric_list:
- metric: acc
num_fewshot: 3
# - task: mmlu
# num_fewshot: 2
 dataset_path: hails/mmlu_no_train
 test_split: test
 fewshot_split: dev
+fewshot_config:
+  sampler: first_n
 output_type: multiple_choice
 doc_to_text: !function ../../../styles.template_05
 doc_to_choice: !function ../../../styles.choice_05a
...
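The new `fewshot_config` block selects how few-shot examples are drawn from `fewshot_split`. A `first_n` sampler reads naturally as taking the first k documents of the dev split; a minimal stand-in under that assumption, not the harness implementation:

```
def first_n_sampler(fewshot_docs, k):
    # Assumed behaviour: deterministically reuse the first k dev examples
    # as the few-shot context for every test document.
    return list(fewshot_docs)[:k]

dev_docs = [{"question": f"q{i}", "answer": f"a{i}"} for i in range(10)]
print(first_n_sampler(dev_docs, 5))
```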
group: mmlu_alt_ov_05a_generative
task:
- mmlu_alt_ov_05a_stem_generative
- mmlu_alt_ov_05a_other_generative
- mmlu_alt_ov_05a_social_sciences_generative
- mmlu_alt_ov_05a_humanities_generative