Commit f66fc06f authored by haileyschoelkopf

fix merge conflicts

parents b13753cd d714fc95
@@ -108,8 +108,8 @@ class HFLM(LM):
assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
self._model = pretrained
self._device = self._model.device
self._config = self._model.config
gpus = 0
if tokenizer:
assert isinstance(
@@ -200,8 +200,9 @@ class HFLM(LM):
)
# access self._model through self.model property outside this method
self.model.eval()
self.model.tie_weights()
if isinstance(self.model, torch.nn.Module):
self.model.eval()
self.model.tie_weights()
if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
# TODO: can remove this whole snippet except in the mps case, perhaps?
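# --- Editor's sketch (not part of the commit): minimal usage of the new
# --- pre-initialized-model path guarded by the assertion above; the model id
# --- is an example, HFLM's import path matches the import used later in this diff.
import transformers
from lm_eval.models.huggingface import HFLM

pre_model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
pre_tok = transformers.AutoTokenizer.from_pretrained("gpt2")
lm = HFLM(pretrained=pre_model, tokenizer=pre_tok)  # parallelize=True would trip the assert above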
@@ -238,6 +239,16 @@ class HFLM(LM):
if self.config.model_type == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
self.tokenizer.pad_token = "<|endoftext|>"
elif (
self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer does not allow adding special tokens / setting the pad token (which is fixed at 0)
# The additional tokenizer name check is needed, as there exist rwkv4 models with a neox tokenizer
# ---
# Note that the world tokenizer class name might change once the final huggingface merge lands
# https://github.com/huggingface/transformers/pull/26963
assert self.tokenizer.pad_token_id == 0
else:
self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
@@ -361,7 +372,7 @@ class HFLM(LM):
def _get_backend(
self,
config: transformers.AutoConfig,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
trust_remote_code: Optional[bool] = False,
) -> None:
@@ -602,8 +613,7 @@ class HFLM(LM):
(batch_size, max_length), device=self.device
).long()
for _ in range(5):
out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)
out = out # Identity process so that it passes pre-commit
out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1) # noqa: F841
return batch_size
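# Editor's sketch of the probe pattern behind auto batch sizing. Assumes (from
# elided context, not shown in this hunk) that the forward pass is wrapped with
# accelerate's find_executable_batch_size, which halves batch_size on CUDA OOM
# until a size fits; the OOM is simulated here so the toy runs anywhere.
from accelerate import find_executable_batch_size

@find_executable_batch_size(starting_batch_size=512)
def probe(batch_size):
    if batch_size > 64:  # stand-in for a real CUDA out-of-memory error
        raise RuntimeError("CUDA out of memory.")
    return batch_size

print(probe())  # -> 64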
@@ -705,10 +715,14 @@ class HFLM(LM):
return self.model(inps).logits
def _model_generate(self, context, max_length, stop, **generation_kwargs):
# we require users to pass do_sample=True explicitly
# for non-greedy gen. This should be reevaluated when considering beam search.
if "do_sample" not in generation_kwargs:
generation_kwargs["do_sample"] = False
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
# remove temperature, as do_sample=False takes care of this
# and we don't want a warning from HF
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
generation_kwargs.pop("temperature")
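# Editor's recap of the normalization above as a standalone helper (the
# function name is invented; the behavior mirrors the lines just shown):
def normalize_gen_kwargs(generation_kwargs: dict) -> dict:
    generation_kwargs.setdefault("do_sample", False)  # greedy unless explicitly opted in
    generation_kwargs.setdefault("temperature", 0.0)
    if generation_kwargs["do_sample"] is False and generation_kwargs.get("temperature") == 0.0:
        generation_kwargs.pop("temperature")  # do_sample=False already means greedy; avoids the HF warning
    return generation_kwargs

assert normalize_gen_kwargs({}) == {"do_sample": False}
assert normalize_gen_kwargs({"temperature": 0.7}) == {"do_sample": False, "temperature": 0.7}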
# build stopping criteria
stopping_criteria = stop_sequences_criteria(
self.tokenizer, stop, context.shape[1], context.shape[0]
@@ -1045,6 +1059,7 @@ class HFLM(LM):
return -len(toks), x[0]
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
print("Passed argument batch_size = auto. Detecting largest batch size")
@@ -1089,7 +1104,7 @@ class HFLM(LM):
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if not until:
until = [self.tok_decode(self.eot_token_id)]
......
from importlib.util import find_spec
from pathlib import Path
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
@register_model("openvino")
class OptimumLM(HFLM):
"""
Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
Intel® architectures using OpenVINO™ runtime.
"""
def __init__(
self,
device="cpu",
**kwargs,
) -> None:
if "backend" in kwargs:
# optimum currently only supports causal models
assert (
kwargs["backend"] == "causal"
), "Currently, only OVModelForCausalLM is supported."
self.openvino_device = device
super().__init__(
device=self.openvino_device,
backend=kwargs.get("backend", "causal"),
**kwargs,
)
def _create_model(
self,
pretrained: str,
revision="main",
dtype="auto",
trust_remote_code=False,
**kwargs,
) -> None:
if not find_spec("optimum"):
raise Exception(
"package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
)
else:
from optimum.intel.openvino import OVModelForCausalLM
model_kwargs = kwargs if kwargs else {}
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
else:
export = True
kwargs["ov_config"] = {
"PERFORMANCE_HINT": "LATENCY",
"NUM_STREAMS": "1",
"CACHE_DIR": "",
}
self._model = OVModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
export=export,
device=self.openvino_device.upper(),
**model_kwargs,
)
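# Editor's usage sketch (module path and model id are assumptions, not taken
# from this diff): load or export an OpenVINO IR model through the new backend.
from lm_eval.models.optimum_lm import OptimumLM

lm = OptimumLM(pretrained="gpt2", device="cpu")  # exports to IR when no openvino_model.xml is found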
@@ -170,18 +170,12 @@ class VLLM(LM):
stop: Optional[List[str]] = None,
**kwargs,
):
if "do_sample" in kwargs.keys():
kwargs.pop("do_sample")
if generate:
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
kwargs["spaces_between_special_tokens"] = kwargs.get(
"spaces_between_special_tokens", False
)
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=2, max_tokens=1
temperature=0, prompt_logprobs=1, max_tokens=1
)
if self.data_parallel_size > 1:
requests = [list(x) for x in divide(requests, self.data_parallel_size)]
@@ -438,3 +432,16 @@ class VLLM(LM):
break
return continuation_logprobs, is_greedy
@staticmethod
def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params
do_sample = kwargs.pop("do_sample", None)
if do_sample is False or "temperature" not in kwargs:
kwargs["temperature"] = 0.0
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
kwargs["spaces_between_special_tokens"] = kwargs.get(
"spaces_between_special_tokens", False
)
return kwargs
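# Editor's worked example of the mapping above (VLLM is the class this
# staticmethod is defined on): HF-style kwargs become vLLM SamplingParams
# defaults, with do_sample=False forcing greedy decoding via temperature=0.0.
assert VLLM.modify_gen_kwargs({"do_sample": False, "temperature": 0.8}) == {
    "temperature": 0.0,
    "skip_special_tokens": False,
    "spaces_between_special_tokens": False,
}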
@@ -117,7 +117,7 @@ class PromptString:
# TODO need a way to process doc_to_choice
if "doc_to_choice" in self.prompt_string:
raise "Not yet implemented to accept doc_to_choice"
raise Exception("Not yet implemented to accept doc_to_choice")
text_string = utils.apply_template(doc_to_text, doc)
target_string = utils.apply_template(doc_to_target, doc)
......
import os
import yaml
import abc
import collections
from functools import partial
from typing import List, Union, Dict
from lm_eval import utils
from lm_eval import prompts
from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
from lm_eval.api.registry import (
register_task,
register_group,
TASK_REGISTRY,
GROUP_REGISTRY,
ALL_TASKS,
)
from lm_eval.api.task import Task, ConfigurableTask
import logging
# import python tasks
from .squadv2.task import SQuAD2
from .scrolls.task import (
QuALITY,
NarrativeQA,
ContractNLI,
GovReport,
SummScreenFD,
QMSum,
)
eval_logger = utils.eval_logger
def register_configurable_task(config: Dict[str, str]) -> int:
SubClass = type(
config["task"] + "ConfigurableTask",
(ConfigurableTask,),
{"CONFIG": TaskConfig(**config)},
)
if "task" in config:
task_name = "{}".format(config["task"])
register_task(task_name)(SubClass)
class TaskManager:
"""TaskManager indexes all tasks from the default `lm_eval/tasks/`
and an optional directory if provided.
if "group" in config:
if config["group"] == config["task"]:
raise ValueError("task and group name cannot be the same")
elif type(config["group"]) == str:
group_name = [config["group"]]
else:
group_name = config["group"]
"""
def __init__(
self,
verbosity="INFO",
include_path=None
) -> None:
self.verbosity = verbosity
self.include_path = include_path
self.logger = utils.eval_logger
self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(
include_path=include_path
)
self._all_tasks = sorted(list(self._task_index.keys()))
for group in group_name:
register_group(group)(SubClass)
self.task_group_map = collections.defaultdict(list)
return 0
def initialize_tasks(self, include_path: str = None):
"""Creates an dictionary of tasks index.
:param include_path: str = None
An additional path to be searched for tasks
:return
Dictionary of task names as key and task metadata
"""
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
if include_path is not None:
if isinstance(include_path, str):
include_path = [include_path]
all_paths.extend(include_path)
task_index = {}
for task_dir in all_paths:
tasks = self._get_task_and_group(task_dir)
task_index = {**tasks, **task_index}
def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int:
group = config["group"]
all_task_list = config["task"]
config_list = [task for task in all_task_list if type(task) != str]
task_list = [task for task in all_task_list if type(task) == str]
for task_config in config_list:
base_config = {}
task_name_config = {}
if "task" in task_config:
task_name = task_config["task"]
if task_name in ALL_TASKS:
task_obj = get_task_dict(task_name)[task_name]
if type(task_obj) == tuple:
_, task_obj = task_obj
if task_obj is not None:
base_config = task_obj._config.to_dict(keep_callable=True)
task_name_config["task"] = f"{group}_{task_name}"
task_config = utils.load_yaml_config(yaml_path, task_config)
var_configs = check_prompt_config(
{
**base_config,
**task_config,
**{"group": group},
**task_name_config,
},
yaml_path=os.path.dirname(yaml_path),
return task_index
@property
def all_tasks(self):
return self._all_tasks
@property
def task_index(self):
return self._task_index
def match_tasks(self, task_list):
return utils.pattern_match(
task_list, self.all_tasks
)
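# Editor's usage sketch of the TaskManager API above (run outside the class,
# against the default task directory shipped with the harness):
tm = TaskManager(verbosity="INFO")
print(len(tm.all_tasks))          # sorted names of every indexed task and group
print(tm.match_tasks(["arc_*"]))  # wildcard matching against that index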
for config in var_configs:
register_configurable_task(config)
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
def _name_is_registered(self, name):
if name in self.all_tasks:
return True
return False
def _name_is_task(self, name):
if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]):
return True
return False
def _name_is_group(self, name):
if self._name_is_registered(name) and (self.task_index[name]["type"] == "group"):
return True
return False
def _name_is_python_task(self, name):
if self._name_is_registered(name) and (self.task_index[name]["type"] == "python_task"):
return True
return False
def _config_is_task(self, config):
if ("task" in config) and isinstance(config["task"], str):
return True
return False
def _config_is_group(self, config):
if ("task" in config) and isinstance(config["task"], list):
return True
return False
def _config_is_python_task(self, config):
if "class" in config:
return True
return False
def _get_yaml_path(self, name):
assert name in self.task_index
return self.task_index[name]["yaml_path"]
def _get_config(self, name):
assert name in self.task_index
yaml_path = self._get_yaml_path(name)
if yaml_path == -1:
return {}
else:
return utils.load_yaml_config(yaml_path, mode="full")
def _get_tasklist(self, name):
assert not self._name_is_task(name)
return self.task_index[name]["task"]
def _process_alias(self, config, group=None):
# If the group is not the same as the original
# group which the group alias was intended for,
# Set the group_alias to None instead.
if ("group_alias" in config) and ("group" in config) and group is not None:
if config["group"] != group:
config["group_alias"] = None
return config
def _load_individual_task_or_group(
self,
name_or_config: Union[str, dict] = None,
parent_name: str = None,
update_config: dict = None,
yaml_path: str = None,
) -> ConfigurableTask:
def load_task(config, task, group=None, yaml_path=None):
if "include" in config:
assert yaml_path is not None
config.update(
utils.load_yaml_config(
yaml_path,
yaml_config={"include": config.pop("include")},
mode="full",
)
)
if self._config_is_python_task(config):
task_object = config["class"]()
else:
config = self._process_alias(config, group=group)
task_object = ConfigurableTask(config=config)
if group is not None:
task_object = (group, task_object)
return {task: task_object}
if isinstance(name_or_config, str):
if update_config is not None:
# Process name_or_config as a dict instead
name_or_config = {"task": name_or_config, **update_config}
elif self._name_is_task(name_or_config):
task_config = self._get_config(name_or_config)
return load_task(task_config, task=name_or_config, group=parent_name)
else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
group_name = name_or_config
subtask_list = self._get_tasklist(name_or_config)
if subtask_list == -1:
group_config = self._get_config(name_or_config)
subtask_list = group_config["task"]
# This checks if we're at the root.
if parent_name is None:
group_config = self._get_config(name_or_config)
if set(group_config.keys()) > set(["task", "group"]):
update_config = {
k:v for k,v in group_config.items() if k not in ["task", "group"]
}
yaml_path = self._get_yaml_path(group_name)
return 0
if (update_config is not None) and ("group_alias" in update_config):
group_name = update_config["group_alias"]
update_config.pop("group_alias")
if isinstance(name_or_config, dict):
def check_prompt_config(
config: Dict[str, str], yaml_path: str = None
) -> List[Dict[str, str]]:
all_configs = []
if "use_prompt" in config:
prompt_list = prompts.load_prompt_list(
use_prompt=config["use_prompt"],
dataset_name=config["dataset_path"],
subset_name=config["dataset_name"] if "dataset_name" in config else None,
yaml_path=yaml_path,
)
for idx, prompt_variation in enumerate(prompt_list):
all_configs.append(
{
**config,
**{"use_prompt": prompt_variation},
**{
"task": "_".join(
[
config["task"]
if "task" in config
else get_task_name_from_config(config),
prompt_variation.split("/")[-1]
if ".yaml" in prompt_variation
else prompt_variation,
]
)
},
**{"output_type": "generate_until"},
if update_config is not None:
name_or_config={
**name_or_config,
**update_config,
}
)
else:
all_configs.append(config)
return all_configs
def get_task_name_from_config(task_config: Dict[str, str]) -> str:
if "dataset_name" in task_config:
return "{dataset_path}_{dataset_name}".format(**task_config)
else:
return "{dataset_path}".format(**task_config)
if self._config_is_task(name_or_config):
name = name_or_config["task"]
# If the name is registered as a group
# if self._name_is_task(name) is False:
if self._name_is_group(name):
group_name = name
update_config = {k:v for k,v in name_or_config.items() if k != "task"}
subtask_list = self._get_tasklist(name)
if subtask_list == -1:
subtask_list = self._get_config(name)["task"]
else:
if self._name_is_registered(name):
base_task_config = self._get_config(name)
# Check if this is a duplicate.
if parent_name is not None:
name_or_config["group"] = parent_name
num_duplicate = len(list(filter(lambda x: x.startswith(name), self.task_group_map[parent_name])))
if num_duplicate > 0:
name = f"{name}-{num_duplicate}"
self.task_group_map[parent_name].append(name)
task_config={
**base_task_config,
**name_or_config,
}
else:
task_config = name_or_config
return load_task(task_config, task=name, group=parent_name, yaml_path=yaml_path)
else:
group_name = name_or_config["group"]
subtask_list = name_or_config["task"]
# update_config = {k:v for k,v in name_or_config.items() if k != "task"}
if set(name_or_config.keys()) > set(["task", "group"]):
update_config = {
k:v for k,v in name_or_config.items() if k not in ["task", "group"]
}
all_subtasks = {}
if (parent_name is not None):
all_subtasks = {group_name: (parent_name, None)}
def include_task_folder(task_dir: str, register_task: bool = True) -> None:
"""
Walks `task_dir` and registers task configs when `register_task` is True, or group configs when it is False.
"""
fn = partial(self._load_individual_task_or_group, parent_name=group_name, update_config=update_config, yaml_path=yaml_path)
all_subtasks = {**all_subtasks, **dict(collections.ChainMap(*map(fn, subtask_list)))}
return all_subtasks
# Track whether any tasks failed during loading
import_fail = False
for root, subdirs, file_list in os.walk(task_dir):
# if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
try:
config = utils.load_yaml_config(yaml_path)
if "task" not in config:
continue
all_configs = check_prompt_config(
config, yaml_path=os.path.dirname(yaml_path)
)
for config in all_configs:
if register_task:
if type(config["task"]) == str:
register_configurable_task(config)
else:
if type(config["task"]) == list:
register_configurable_group(config, yaml_path)
# Log this silently and show it only when
# the user defines the appropriate verbosity.
except (ImportError, ModuleNotFoundError) as e:
import_fail = True
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
import traceback
eval_logger.warning(
"Unexpected error loading config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}\n"
f" Traceback: {traceback.format_exc()}"
)
if import_fail:
eval_logger.warning(
"Some tasks could not be loaded due to missing dependencies."
" Run with `--verbosity DEBUG` for full details."
)
return 0
def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
"""Loads a dictionary of task objects from a list
:param task_list: Union[str, list] = None
Single string or list of strings of task names to be loaded
def include_path(task_dir):
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)
return 0
:return
Dictionary of task objects
"""
if isinstance(task_list, str):
task_list = [task_list]
all_loaded_tasks = dict(
collections.ChainMap(
*map(
self._load_individual_task_or_group,
task_list
)
)
)
return all_loaded_tasks
def load_config(self, config: Dict):
return self._load_individual_task_or_group(config)
def _get_task_and_group(self, task_dir: str):
"""Creates an dictionary of tasks index with the following metadata,
- `type`, that can be either `task`, `python_task`, or `group`.
`task` refer to regular task configs, `python_task` are special
yaml files that only consists of `task` and `class` parameters.
`group` are group configs.
- `yaml_path`, path to the yaml file. If the entry is a `group` that
was configured through a task config, the yaml_path will be -1
and all subtasks will be listed in `task` (see below)
- `task`, reserved for entries with `type` as `group`. This will list
all subtasks. When a group config is created (as opposed to task
config having `group` parameter set), this will be set to -1 to
avoid recursive indexing. The whole list of subtasks will be loaded
at evaluation.
:param task_dir: str
A directory to check for tasks
:return
Dictionary of task names as key and task metadata
"""
tasks_and_groups = collections.defaultdict()
for root, _, file_list in os.walk(task_dir):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
config = utils.load_yaml_config(yaml_path, mode="simple")
if self._config_is_python_task(config):
# This is a python class config
tasks_and_groups[config["task"]] = {
"type": "python_task",
"yaml_path": yaml_path,
}
elif self._config_is_group(config):
# This is a group config
tasks_and_groups[config["group"]] = {
"type": "group",
"task": -1, # This signals that
# we don't need to know
# the task list for indexing
# as it can be loaded
# when called.
"yaml_path": yaml_path,
}
def initialize_tasks(verbosity="INFO"):
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
# # Registered the level 1 tasks from a group config
# for config in config["task"]:
# if isinstance(config, dict) and self._config_is_task(config):
# task = config["task"]
# tasks_and_groups[task] = {
# "type": "task",
# "yaml_path": yaml_path,
# }
elif self._config_is_task(config):
# This is a task config
task = config["task"]
tasks_and_groups[task] = {
"type": "task",
"yaml_path": yaml_path,
}
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)
if "group" in config:
groups = config["group"]
if isinstance(config["group"], str):
groups = [groups]
for group in groups:
if group not in tasks_and_groups:
tasks_and_groups[group] = {
"type": "group",
"task": [task],
"yaml_path": -1,
}
else:
tasks_and_groups[group]["task"].append(task)
else:
self.logger.debug(f"File {f} in {root} could not be loaded")
return tasks_and_groups
def include_path(task_dir):
logger = utils.eval_logger
logger.setLevel(getattr(logging, "INFO"))
logger.info(
"To still use tasks loaded from args.include_path,"
"see an example of the new TaskManager API in https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
return 0
def get_task(task_name, config):
try:
return TASK_REGISTRY[task_name](config=config)
except KeyError:
eval_logger.info("Available tasks:")
eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
raise KeyError(f"Missing task {task_name}")
def initialize_tasks(verbosity="INFO"):
logger = utils.eval_logger
logger.setLevel(getattr(logging, f"{verbosity}"))
logger.info(
"lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
"It will be removed in v0.4.2 release. "
"TaskManager will instead be used."
)
return 0
def get_task_name_from_config(task_config: Dict[str, str]) -> str:
if "task" in task_config:
return task_config["task"]
if "dataset_name" in task_config:
return "{dataset_path}_{dataset_name}".format(**task_config)
else:
return "{dataset_path}".format(**task_config)
def get_task_name_from_object(task_object):
for name, class_ in TASK_REGISTRY.items():
if class_ is task_object:
return name
if hasattr(task_object, "config"):
return task_object._config["task"]
# TODO: scrap this
# this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
@@ -234,54 +382,40 @@ def get_task_name_from_object(task_object):
else type(task_object).__name__
)
def get_task_dict(task_name_list: List[Union[str, Dict, Task]], task_manager: TaskManager = None):
"""Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
# TODO: pass num_fewshot and other cmdline overrides in a better way
def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
config = {**kwargs}
:param task_name_list: List[Union[str, Dict, Task]]
List of task names (str), task config dicts, or instantiated Task objects to load
:param task_manager: TaskManager = None
A TaskManager object that stores the indexed tasks. If not set,
one will be created internally. Users should set this themselves
if additional task paths need to be included
via `include_path`
task_name_from_registry_dict = {}
:return
Dictionary of task objects
"""
task_name_from_string_dict = {}
task_name_from_config_dict = {}
task_name_from_object_dict = {}
if type(task_name_list) != list:
if isinstance(task_name_list, str):
task_name_list = [task_name_list]
for task_element in task_name_list:
if isinstance(task_element, str):
if task_element in GROUP_REGISTRY:
group_name = task_element
for task_name in GROUP_REGISTRY[task_element]:
if task_name not in task_name_from_registry_dict:
task_obj = get_task_dict(task_name)
if task_name in task_obj.keys():
task_dict = {
task_name: (group_name, task_obj[task_name]),
}
else:
task_dict = {
task_name: (group_name, None),
**task_obj,
}
string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
others_task_name_list = [task for task in task_name_list if not isinstance(task, str)]
if len(string_task_name_list) > 0:
if task_manager is None:
task_manager = TaskManager()
task_name_from_registry_dict = {
**task_name_from_registry_dict,
**task_dict,
}
else:
task_name = task_element
if task_name not in task_name_from_registry_dict:
task_name_from_registry_dict = {
**task_name_from_registry_dict,
task_name: get_task(task_name=task_element, config=config),
}
task_name_from_string_dict = task_manager.load_task_or_group(string_task_name_list)
elif isinstance(task_element, dict):
task_element.update(config)
for task_element in others_task_name_list:
if isinstance(task_element, dict):
task_name_from_config_dict = {
**task_name_from_config_dict,
get_task_name_from_config(task_element): ConfigurableTask(
config=task_element
),
**task_manager.load_config(config=task_element),
}
elif isinstance(task_element, Task):
@@ -290,11 +424,11 @@ def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
get_task_name_from_object(task_element): task_element,
}
assert set(task_name_from_registry_dict.keys()).isdisjoint(
assert set(task_name_from_string_dict.keys()).isdisjoint(
set(task_name_from_object_dict.keys())
)
return {
**task_name_from_registry_dict,
**task_name_from_string_dict,
**task_name_from_config_dict,
**task_name_from_object_dict,
}
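# Editor's usage sketch of the updated entry point (task name is an example):
tm = TaskManager()
task_dict = get_task_dict(["arc_easy"], task_manager=tm)  # str entries resolved via the manager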
group:
- ai2_arc
task: arc_easy
dataset_path: ai2_arc
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
......
@@ -28,7 +28,7 @@ if __name__ == "__main__":
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
base_doc_to_text = "Q: {{input}}\nA:"
@@ -70,7 +70,7 @@ if __name__ == "__main__":
file_save_path = args.save_prefix_path + f"/{task}.yaml"
utils.eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
......
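# Editor's note on the pattern applied throughout these scripts: passing
# encoding="utf-8" keeps YAML reads and writes consistent on platforms where
# open() otherwise defaults to a locale code page (e.g. Windows).
with open("example.yaml", "w", encoding="utf-8") as yaml_file:
    yaml_file.write("task: arc_easy\n")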
@@ -28,3 +28,4 @@ filter_list:
num_fewshot: 0
metadata:
version: 2.0
num_fewshot: 3 # controls what is printed in n-shot
@@ -19,3 +19,4 @@ generation_kwargs:
num_fewshot: 0
metadata:
version: 1.0
num_fewshot: 3 # will be printed in results table
@@ -27,13 +27,13 @@ if __name__ == "__main__":
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
def query():
@@ -42,7 +42,7 @@ if __name__ == "__main__":
print(query())
languages = [split["split"] for split in query()]
for lang in tqdm(languages):
for lang in tqdm([lang for lang in languages if "default" not in lang]):
yaml_dict = {
"include": base_yaml_name,
"task": f"belebele_{args.task_prefix}_{lang}"
@@ -54,7 +54,7 @@ if __name__ == "__main__":
file_save_path = args.save_prefix_path + f"_{lang}.yaml"
logging.info(f"Saving yaml for subset {lang} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
......
"fewshot_split": "default"
"include": "_default_template_yaml"
"task": "belebele_default"
"test_split": "default"
output_type: generate_until
validation_split: validation
test_split: null
doc_to_choice: null
metric_list:
- metric: exact_match
aggregation: mean
......
group: flan_anli
task:
- include: yaml_templates/held_in_template_yaml
task: anli_r1
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: anli_r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: anli_r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
group: flan_arc
task:
- include: yaml_templates/held_in_template_yaml
task: arc_easy
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
task: arc_challenge
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
group: flan_boolq
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
group: flan_cot
task:
- include: yaml_templates/cot_template_yaml
dataset_path: gsm8k
dataset_name: boolq
use_prompt: promptsource:*
validation_split: validation
- include: yaml_templates/cot_template_yaml
dataset_path: EleutherAI/asdiv
use_prompt: promptsource:*
validation_split: validation
group: flan_held_in
group_alias: Flan (Held-In)
task:
- flan_boolq
- flan_rte
- flan_anli
- flan_arc
# ANLI R1
- group: anli_r1_flan
group_alias: ANLI R1
task:
- task: anli_r1
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
# ANLI R2
- group: anli_r2_flan
group_alias: ANLI R2
task:
- task: anli_r2
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
# ANLI R3
- group: anli_r3_flan
group_alias: ANLI R3
task:
- task: anli_r3
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
# Arc Easy
- group: arc_easy_flan
group_alias: Arc Easy
task:
- task: arc_easy
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
# Arc Challenge
- group: arc_challenge_flan
group_alias: Arc Challenge
task:
- task: arc_challenge
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
# BoolQ
- group: boolq_flan
group_alias: BoolQ
task:
- task: boolq
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
task_alias: prompt-9
include: _held_in_template_yaml
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
# RTE
- group: rte_flan
group_alias: RTE
task:
- task: rte
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
group: flan_held_in
task:
- include: flan/yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: flan/prompt_templates/boolq.yaml:*
validation_split: validation
- include: flan/yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: flan/prompt_templates/rte.yaml:*
validation_split: validation
- include: flan/yaml_templates/held_in_template_yaml
task: anli_r1
dataset_path: anli
use_prompt: flan/prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: flan/yaml_templates/held_in_template_yaml
task: anli_r2
dataset_path: anli
use_prompt: flan/prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: flan/yaml_templates/held_in_template_yaml
task: anli_r3
dataset_path: anli
use_prompt: flan/prompt_templates/anli.yaml:*
validation_split: dev_r3
- include: flan/yaml_templates/held_in_template_yaml
task: arc_easy
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: flan/prompt_templates/arc.yaml:*
validation_split: validation
- include: flan/yaml_templates/held_in_template_yaml
task: arc_challenge
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: flan/prompt_templates/arc.yaml:*
validation_split: validation
group: flan_held_out
task:
# BBH
- bbh_flan_zeroshot
- bbh_flan_fewshot
- bbh_flan_cot_fewshot
- bbh_flan_cot_zeroshot
- bbh_zeroshot
- bbh_fewshot
- bbh_cot_fewshot
- bbh_cot_zeroshot
# MMLU
- mmlu
- mmlu_flan_n_shot_generative
......
group: flan_rte
task:
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation