gaoqiong / lm-evaluation-harness · Commits

Commit cb8889cc
Authored Feb 05, 2024 by lintangsutawika
Parents: ec05e561, 74119471

    merged with latest update from main

The commit touches 69 files; this page shows the first 20 changed files, with 739 additions and 139 deletions (+739, -139).
Changed files shown on this page:

  lm_eval/models/huggingface.py                              +26  -11
  lm_eval/models/optimum_lm.py                               +69   -0
  lm_eval/models/vllm_causallms.py                           +15   -8
  lm_eval/prompts/__init__.py                                 +1   -1
  lm_eval/tasks/__init__.py                                 +251  -92
  lm_eval/tasks/bbh/_generate_configs.py                      +2   -2
  lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml    +1   -0
  lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml            +1   -0
  lm_eval/tasks/belebele/_generate_configs.py                 +4   -4
  lm_eval/tasks/belebele/belebele_default.yaml                +0   -4
  lm_eval/tasks/benchmarks/flan/_held_in_template_yaml       +14   -0
  lm_eval/tasks/benchmarks/flan/flan_held_in.yaml           +331   -0
  lm_eval/tasks/benchmarks/flan/flan_held_out.yaml           +13   -0
  lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml         +0   -6
  lm_eval/tasks/bigbench/generate_tasks.py                    +1   -1
  lm_eval/tasks/blimp/generate_configs.py                     +1   -1
  lm_eval/tasks/ceval/_generate_configs.py                    +3   -3
  lm_eval/tasks/cmmlu/_generate_configs.py                    +3   -3
  lm_eval/tasks/code_x_glue/code-text/bleu.py                 +1   -1
  lm_eval/tasks/csatqa/_generate_configs.py                   +2   -2
lm_eval/models/huggingface.py (+26, -11)

@@ -108,8 +108,8 @@ class HFLM(LM):
             assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
             self._model = pretrained
             self._device = self._model.device
             self._config = self._model.config
+            gpus = 0

             if tokenizer:
                 assert isinstance(
@@ -200,6 +200,7 @@ class HFLM(LM):
         )
         # access self._model through self.model property outside this method
+        if isinstance(self.model, torch.nn.Module):
             self.model.eval()
             self.model.tie_weights()
@@ -238,6 +239,16 @@ class HFLM(LM):
         if self.config.model_type == "qwen":
             # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
             self.tokenizer.pad_token = "<|endoftext|>"
+        elif (
+            self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
+            or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
+        ):
+            # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
+            # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
+            # ---
+            # Note that the world tokenizer class name, might change in the future for the final huggingface merge
+            # https://github.com/huggingface/transformers/pull/26963
+            assert self.tokenizer.pad_token_id == 0
         else:
             self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
@@ -361,7 +372,7 @@ class HFLM(LM):
     def _get_backend(
         self,
-        config: transformers.AutoConfig,
+        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
         backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
         trust_remote_code: Optional[bool] = False,
     ) -> None:
@@ -602,8 +613,7 @@ class HFLM(LM):
                 (batch_size, max_length), device=self.device
             ).long()
             for _ in range(5):
-                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)
-                out = out  # Identity process so that it passes pre-commit
+                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)  # noqa: F841

             return batch_size
@@ -705,10 +715,14 @@ class HFLM(LM):
             return self.model(inps).logits

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
-        # we require users to pass do_sample=True explicitly
-        # for non-greedy gen. This should be reevaluated when considering beam search.
-        if "do_sample" not in generation_kwargs:
-            generation_kwargs["do_sample"] = False
+        # temperature = 0.0 if not set
+        # if do_sample is false and temp==0.0:
+        # remove temperature, as do_sample=False takes care of this
+        # and we don't want a warning from HF
+        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+        do_sample = generation_kwargs.get("do_sample", None)
+        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+            generation_kwargs.pop("temperature")
         # build stopping criteria
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
@@ -1045,6 +1059,7 @@ class HFLM(LM):
             return -len(toks), x[0]

         pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        adaptive_batch_size = None
         if self.batch_size == "auto":
             # using rolling window with maximum context
             print("Passed argument batch_size = auto. Detecting largest batch size")
@@ -1089,7 +1104,7 @@ class HFLM(LM):
             )
         else:
             raise ValueError(
-                f"Expected `kwargs` to be of type `dict` but got {kwargs}"
+                f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
             )
         if not until:
             until = [self.tok_decode(self.eot_token_id)]
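The _model_generate hunk above changes how sampling arguments are prepared before calling HF generate(). Below is a minimal, self-contained sketch of that normalization; the function name is hypothetical and only mirrors the merged logic, it is not part of the harness API.

    # Hypothetical stand-alone restatement of the merged kwarg handling.
    def normalize_generation_kwargs(generation_kwargs: dict) -> dict:
        # temperature defaults to 0.0 when the caller does not set it
        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
        do_sample = generation_kwargs.get("do_sample", None)
        # greedy decoding with temperature 0.0: drop temperature so HF's
        # generate() does not warn about an unused sampling parameter
        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
            generation_kwargs.pop("temperature")
        return generation_kwargs

    print(normalize_generation_kwargs({"do_sample": False}))  # {'do_sample': False}
    print(normalize_generation_kwargs({"temperature": 0.7}))  # {'temperature': 0.7}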
lm_eval/models/optimum_lm.py (new file, +69, -0)

from importlib.util import find_spec
from pathlib import Path

from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM


@register_model("openvino")
class OptimumLM(HFLM):
    """
    Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
    OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
    Intel® architectures using OpenVINO™ runtime.
    """

    def __init__(
        self,
        device="cpu",
        **kwargs,
    ) -> None:
        if "backend" in kwargs:
            # optimum currently only supports causal models
            assert (
                kwargs["backend"] == "causal"
            ), "Currently, only OVModelForCausalLM is supported."

        self.openvino_device = device

        super().__init__(
            device=self.openvino_device,
            backend=kwargs.get("backend", "causal"),
            **kwargs,
        )

    def _create_model(
        self,
        pretrained: str,
        revision="main",
        dtype="auto",
        trust_remote_code=False,
        **kwargs,
    ) -> None:
        if not find_spec("optimum"):
            raise Exception(
                "package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
            )
        else:
            from optimum.intel.openvino import OVModelForCausalLM

        model_kwargs = kwargs if kwargs else {}
        model_file = Path(pretrained) / "openvino_model.xml"
        if model_file.exists():
            export = False
        else:
            export = True
        kwargs["ov_config"] = {
            "PERFORMANCE_HINT": "LATENCY",
            "NUM_STREAMS": "1",
            "CACHE_DIR": "",
        }

        self._model = OVModelForCausalLM.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
            export=export,
            device=self.openvino_device.upper(),
            **model_kwargs,
        )
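Since OptimumLM subclasses HFLM and is registered as "openvino", it can be constructed like any other harness model class. A minimal sketch, assuming `optimum[openvino]` is installed; the model path below is a placeholder, not something from this commit.

    from lm_eval.models.optimum_lm import OptimumLM

    # Loads ./my-openvino-model/openvino_model.xml if it exists, otherwise the
    # checkpoint is exported to OpenVINO IR first (the export=True path above).
    ov_lm = OptimumLM(pretrained="./my-openvino-model", device="cpu")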
lm_eval/models/vllm_causallms.py (+15, -8)

@@ -170,18 +170,12 @@ class VLLM(LM):
         stop: Optional[List[str]] = None,
         **kwargs,
     ):
-        if "do_sample" in kwargs.keys():
-            kwargs.pop("do_sample")
         if generate:
-            # hf defaults
-            kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
-            kwargs["spaces_between_special_tokens"] = kwargs.get("spaces_between_special_tokens", False)
+            kwargs = self.modify_gen_kwargs(kwargs)
             sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
         else:
             sampling_params = SamplingParams(
-                temperature=0, prompt_logprobs=2, max_tokens=1
+                temperature=0, prompt_logprobs=1, max_tokens=1
             )
         if self.data_parallel_size > 1:
             requests = [list(x) for x in divide(requests, self.data_parallel_size)]
@@ -438,3 +432,16 @@ class VLLM(LM):
                     break

         return continuation_logprobs, is_greedy
+
+    @staticmethod
+    def modify_gen_kwargs(kwargs: dict) -> dict:
+        # sampling_params
+        do_sample = kwargs.pop("do_sample", None)
+        if do_sample is False or "temperature" not in kwargs:
+            kwargs["temperature"] = 0.0
+        # hf defaults
+        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
+        kwargs["spaces_between_special_tokens"] = kwargs.get("spaces_between_special_tokens", False)
+        return kwargs
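For reference, the new modify_gen_kwargs static method can be exercised on its own; the snippet below simply restates the merged logic outside the class to show what it does to a caller's kwargs.

    def modify_gen_kwargs(kwargs: dict) -> dict:
        # do_sample is not a vLLM SamplingParams argument, so fold it into temperature
        do_sample = kwargs.pop("do_sample", None)
        if do_sample is False or "temperature" not in kwargs:
            kwargs["temperature"] = 0.0
        # hf defaults: keep special tokens in the decoded output, matching HF generate()
        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
        kwargs["spaces_between_special_tokens"] = kwargs.get("spaces_between_special_tokens", False)
        return kwargs

    print(modify_gen_kwargs({"do_sample": False, "temperature": 0.8}))
    # {'temperature': 0.0, 'skip_special_tokens': False, 'spaces_between_special_tokens': False}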
lm_eval/prompts/__init__.py (+1, -1)

@@ -117,7 +117,7 @@ class PromptString:
         # TODO need a way to process doc_to_choice
         if "doc_to_choice" in self.prompt_string:
-            raise "Not yet implemented to accept doc_to_choice"
+            raise Exception("Not yet implemented to accept doc_to_choice")

         text_string = utils.apply_template(doc_to_text, doc)
         target_string = utils.apply_template(doc_to_target, doc)
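The one-line change above replaces a string `raise` with a real exception. Raising a bare string has been invalid since Python 3, so the old line would surface as a TypeError rather than the intended message, as this small check shows:

    try:
        raise "Not yet implemented to accept doc_to_choice"   # old behaviour
    except TypeError as err:
        print(err)  # "exceptions must derive from BaseException"

    # new behaviour: a real exception object carrying the message
    # raise Exception("Not yet implemented to accept doc_to_choice")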
lm_eval/tasks/__init__.py (+251, -92)

 import os
 import abc
-import yaml
 import collections

 from functools import partial
 from typing import List, Union, Dict

 from lm_eval import utils
-from lm_eval import prompts
-from lm_eval.api.task import Task, ConfigurableTask
+from lm_eval.api.task import TaskConfig, Task, ConfigurableTask

 import logging

-# # import python tasks
-# import squadv2.task
-# import scrolls.task
-# python_tasks = {
-#     "squadv2": squadv2.task.SQuAD2,
-#     "scrolls_quality": scrolls.task.QuALITY,
-#     "scrolls_narrativeqa": scrolls.task.NarrativeQA,
-#     "scrolls_contractnli": scrolls.task.ContractNLI,
-#     "scrolls_govreport": scrolls.task.GovReport,
-#     "scrolls_summscreenfd": scrolls.task.SummScreenFD,
-#     "scrolls_qmsum": scrolls.task.QMSum,
-# }
-
-GROUP_KEYS = ["group", "task", "weight_by_size"]
-PYTHON_TASK_KEYS = ["task", "class"]
+eval_logger = utils.eval_logger

-class TaskManager(abc.ABC):
+class TaskManager:
+    """TaskManager indexes all tasks from the default `lm_eval/tasks/`
+    and an optional directory if provided.
+    """

     def __init__(
         self,
         verbosity="INFO",

@@ -40,79 +24,132 @@ class TaskManager(abc.ABC):
         self.verbosity = verbosity
         self.include_path = include_path
-        self.logger = eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+        self.logger = utils.eval_logger
+        self.logger.setLevel(getattr(logging, f"{verbosity}"))

-        self.ALL_TASKS = self.initialize_tasks(
+        self._task_index = self.initialize_tasks(
             include_path=include_path
         )
+        self._all_tasks = sorted(list(self._task_index.keys()))
+
+        self.task_group_map = collections.defaultdict(list)

-    def initialize_tasks(self, include_path=None):
+    def initialize_tasks(self, include_path: str = None):
+        """Creates an dictionary of tasks index.
+
+        :param include_path: str = None
+            An additional path to be searched for tasks
+
+        :return
+            Dictionary of task names as key and task metadata
+        """
         all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
         if include_path is not None:
             if isinstance(include_path, str):
                 include_path = [include_path]
             all_paths.extend(include_path)

-        ALL_TASKS = {}
+        task_index = {}
         for task_dir in all_paths:
             tasks = self._get_task_and_group(task_dir)
-            ALL_TASKS = {**tasks, **ALL_TASKS}
+            task_index = {**tasks, **task_index}

-        return ALL_TASKS
+        return task_index

     @property
     def all_tasks(self):
-        return sorted(list(self.ALL_TASKS.keys()))
+        return self._all_tasks
+
+    @property
+    def task_index(self):
+        return self._task_index
+
+    def match_tasks(self, task_list):
+        return utils.pattern_match(task_list, self.all_tasks)

     def _name_is_registered(self, name):
-        if name in self.ALL_TASKS:
+        if name in self.all_tasks:
             return True
         return False

     def _name_is_task(self, name):
-        if self._name_is_registered(name) and ("task" in self.ALL_TASKS[name]["type"]):
+        if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]):
             return True
         return False

+    def _name_is_group(self, name):
+        if self._name_is_registered(name) and (self.task_index[name]["type"] == "group"):
+            return True
+        return False
+
     def _name_is_python_task(self, name):
-        if self._name_is_registered(name) and (self.ALL_TASKS[name]["type"] == "python_task"):
+        if self._name_is_registered(name) and (self.task_index[name]["type"] == "python_task"):
             return True
         return False

     def _config_is_task(self, config):
-        if set(config.keys()) <= set(GROUP_KEYS):
+        if ("task" in config) and isinstance(config["task"], str):
             return True
         return False

+    def _config_is_group(self, config):
+        if ("task" in config) and isinstance(config["task"], list):
+            return True
+        return False
+
     def _config_is_python_task(self, config):
-        if set(config.keys()) == set(PYTHON_TASK_KEYS):
+        if "class" in config:
             return True
         return False

     def _get_yaml_path(self, name):
-        assert name in self.ALL_TASKS
-        return self.ALL_TASKS[name]["yaml_path"]
+        assert name in self.task_index
+        return self.task_index[name]["yaml_path"]

     def _get_config(self, name):
-        assert name in self.ALL_TASKS
+        assert name in self.task_index
         yaml_path = self._get_yaml_path(name)
-        return utils.load_yaml_config("full", yaml_path)
+        if yaml_path == -1:
+            return {}
+        else:
+            return utils.load_yaml_config(yaml_path, mode="full")

     def _get_tasklist(self, name):
         assert self._name_is_task(name) == False
-        return self.ALL_TASKS[name]["task"]
+        return self.task_index[name]["task"]

+    def _process_alias(self, config, group=None):
+        # If the group is not the same as the original
+        # group which the group alias was intended for,
+        # Set the group_alias to None instead.
+        if ("group_alias" in config) and ("group" in config) and group is not None:
+            if config["group"] != group:
+                config["group_alias"] = None
+        return config
+
     def _load_individual_task_or_group(
         self,
         name_or_config: Union[str, dict] = None,
         parent_name: str = None,
-        update_config: dict = None
+        update_config: dict = None,
+        yaml_path: str = None,
     ) -> ConfigurableTask:
-        def load_task(config, task, group=None, is_python_class=False):
-            if is_python_class:
-                task_object = config["class"]()
-            else:
-                task_object = ConfigurableTask(config=config)
+        def load_task(config, task, group=None, yaml_path=None):
+            if "include" in config:
+                assert yaml_path is not None
+                config.update(
+                    utils.load_yaml_config(
+                        yaml_path,
+                        yaml_config={"include": config.pop("include")},
+                        mode="full",
+                    )
+                )
+            if self._config_is_python_task(config):
+                task_object = config["class"]()
+            else:
+                config = self._process_alias(config, group=group)
+                task_object = ConfigurableTask(config=config)
             if group is not None:
                 task_object = (group, task_object)

@@ -124,15 +161,26 @@
                 name_or_config = {"task": name_or_config, **update_config}
             elif self._name_is_task(name_or_config):
                 task_config = self._get_config(name_or_config)
-                is_python_class = False
-                if self._name_is_python_task(name_or_config):
-                    is_python_class = True
-                return load_task(task_config, task=name_or_config, group=parent_name, is_python_class=is_python_class)
+                return load_task(task_config, task=name_or_config, group=parent_name)
             else:
                 group_name = name_or_config
                 subtask_list = self._get_tasklist(name_or_config)
                 if subtask_list == -1:
-                    subtask_list = self._get_config(name_or_config)["task"]
+                    group_config = self._get_config(name_or_config)
+                    subtask_list = group_config["task"]
+
+                # This checks if we're at the root.
+                if parent_name is None:
+                    group_config = self._get_config(name_or_config)
+                    if set(group_config.keys()) > set(["task", "group"]):
+                        update_config = {
+                            k: v for k, v in group_config.items() if k not in ["task", "group"]
+                        }
+                    yaml_path = self._get_yaml_path(group_name)
+
+                    if (update_config is not None) and ("group_alias" in update_config):
+                        group_name = update_config["group_alias"]
+                        update_config.pop("group_alias")

         if isinstance(name_or_config, dict):

@@ -145,7 +193,8 @@
             if self._config_is_task(name_or_config):
                 name = name_or_config["task"]
                 # If the name is registered as a group
-                if self._name_is_task(name) is False:
+                # if self._name_is_task(name) is False:
+                if self._name_is_group(name):
                     group_name = name
                     update_config = {k: v for k, v in name_or_config.items() if k != "task"}
                     subtask_list = self._get_tasklist(name)

@@ -154,28 +203,49 @@
                 else:
                     if self._name_is_registered(name):
                         base_task_config = self._get_config(name)
+
+                        # Check if this is a duplicate.
+                        if parent_name is not None:
+                            name_or_config["group"] = parent_name
+                            num_duplicate = len(list(filter(lambda x: x.startswith(name), self.task_group_map[parent_name])))
+                            if num_duplicate > 0:
+                                name = f"{name}-{num_duplicate}"
+                            self.task_group_map[parent_name].append(name)
+
                         task_config = {
                             **base_task_config,
                             **name_or_config,
                         }
                     else:
                         task_config = name_or_config
-                    return load_task(task_config, task=name, group=parent_name)
+                    return load_task(task_config, task=name, group=parent_name, yaml_path=yaml_path)
             else:
                 group_name = name_or_config["group"]
                 subtask_list = name_or_config["task"]
-                # update_config = {k:v for k,v in name_or_config.items() if k != "task"}
+                if set(name_or_config.keys()) > set(["task", "group"]):
+                    update_config = {
+                        k: v for k, v in name_or_config.items() if k not in ["task", "group"]
+                    }

         all_subtasks = {}
-        if (parent_name is not None) and ((self._name_is_registered(group_name) is False) or (self._get_yaml_path(group_name) == -1)):
+        if parent_name is not None:
             all_subtasks = {group_name: (parent_name, None)}

-        fn = partial(self._load_individual_task_or_group, parent_name=group_name, update_config=update_config)
+        fn = partial(self._load_individual_task_or_group, parent_name=group_name, update_config=update_config, yaml_path=yaml_path)
         all_subtasks = {**all_subtasks, **dict(collections.ChainMap(*map(fn, subtask_list)))}
         return all_subtasks

     def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
+        """Loads a dictionary of task objects from a list
+
+        :param task_list: Union[str, list] = None
+            Single string or list of string of task names to be loaded
+
+        :return
+            Dictionary of task objects
+        """
         if isinstance(task_list, str):
             task_list = [task_list]

@@ -189,20 +259,43 @@
             )

         return all_loaded_tasks

+    def load_config(self, config: Dict):
+        return self._load_individual_task_or_group(config)
+
     def _get_task_and_group(self, task_dir: str):
+        """Creates an dictionary of tasks index with the following metadata,
+        - `type`, that can be either `task`, `python_task`, or `group`.
+            `task` refer to regular task configs, `python_task` are special
+            yaml files that only consists of `task` and `class` parameters.
+            `group` are group configs.
+        - `yaml_path`, path to the yaml file. If the entry is a `group` that
+            was configured through a task config, the yaml_path will be -1
+            and all subtasks will be listed in `task` (see below)
+        - `task`, reserved for entries with `type` as `group`. This will list
+            all subtasks. When a group config is created (as opposed to task
+            config having `group` parameter set), this will be set to -1 to
+            avoid recursive indexing. The whole list of subtasks will be loaded
+            at evaluation.
+
+        :param task_dir: str
+            A directory to check for tasks
+
+        :return
+            Dictionary of task names as key and task metadata
+        """
         tasks_and_groups = collections.defaultdict()
         for root, _, file_list in os.walk(task_dir):
             for f in file_list:
                 if f.endswith(".yaml"):
                     yaml_path = os.path.join(root, f)
-                    config = utils.load_yaml_config("simple", yaml_path)
+                    config = utils.load_yaml_config(yaml_path, mode="simple")
-                    if set(config.keys()) == set(PYTHON_TASK_KEYS):
+                    if self._config_is_python_task(config):
                         # This is a python class config
                         tasks_and_groups[config["task"]] = {
                             "type": "python_task",
                             "yaml_path": yaml_path,
                         }
-                    elif set(config.keys()) <= set(GROUP_KEYS):
+                    elif self._config_is_group(config):
                         # This is a group config
                         tasks_and_groups[config["group"]] = {
                             "type": "group",

@@ -213,7 +306,17 @@
                             # when called.
                             "yaml_path": yaml_path,
                         }
-                    else:
+                        # # Registered the level 1 tasks from a group config
+                        # for config in config["task"]:
+                        #     if isinstance(config, dict) and self._config_is_task(config):
+                        #         task = config["task"]
+                        #         tasks_and_groups[task] = {
+                        #             "type": "task",
+                        #             "yaml_path": yaml_path,
+                        #         }
+
+                    elif self._config_is_task(config):
                         # This is a task config
                         task = config["task"]
                         tasks_and_groups[task] = {

@@ -235,41 +338,97 @@
                         }
                     else:
                         tasks_and_groups[group]["task"].append(task)
+                else:
+                    self.logger.debug(f"File {f} in {root} could not be loaded")

         return tasks_and_groups


-# def check_prompt_config(
-#     config: Dict[str, str], yaml_path: str = None
-# ) -> List[Dict[str, str]]:
-#     all_configs = []
-#     if "use_prompt" in config:
-#         prompt_list = prompts.load_prompt_list(
-#             use_prompt=config["use_prompt"],
-#             dataset_name=config["dataset_path"],
-#             subset_name=config["dataset_name"] if "dataset_name" in config else None,
-#             yaml_path=yaml_path,
-#         )
-#         for idx, prompt_variation in enumerate(prompt_list):
-#             all_configs.append(
-#                 {
-#                     **config,
-#                     **{"use_prompt": prompt_variation},
-#                     **{
-#                         "task": "_".join(
-#                             [
-#                                 config["task"]
-#                                 if "task" in config
-#                                 else get_task_name_from_config(config),
-#                                 prompt_variation.split("/")[-1]
-#                                 if ".yaml" in prompt_variation
-#                                 else prompt_variation,
-#                             ]
-#                         )
-#                     },
-#                     **{"output_type": "generate_until"},
-#                 }
-#             )
-#     else:
-#         all_configs.append(config)
-#     return all_configs
+def include_path(task_dir):
+    logger = utils.eval_logger
+    logger.setLevel(getattr(logging, "INFO"))
+    logger.info(
+        "To still use tasks loaded from args.include_path,"
+        "see an example of the new TaskManager API in https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
+    )
+    return 0
+
+
+def initialize_tasks(verbosity="INFO"):
+    logger = utils.eval_logger
+    logger.setLevel(getattr(logging, f"{verbosity}"))
+    logger.info(
+        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
+        "It will be removed in v0.4.2 release. "
+        "TaskManager will instead be used."
+    )
+    return 0
+
+
+def get_task_name_from_config(task_config: Dict[str, str]) -> str:
+    if "task" in task_config:
+        return task_config["task"]
+    if "dataset_name" in task_config:
+        return "{dataset_path}_{dataset_name}".format(**task_config)
+    else:
+        return "{dataset_path}".format(**task_config)
+
+
+def get_task_name_from_object(task_object):
+    if hasattr(task_object, "config"):
+        return task_object._config["task"]
+
+    # TODO: scrap this
+    # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
+    return (
+        task_object.EVAL_HARNESS_NAME
+        if hasattr(task_object, "EVAL_HARNESS_NAME")
+        else type(task_object).__name__
+    )
+
+
+def get_task_dict(task_name_list: List[Union[str, Dict, Task]], task_manager: TaskManager = None):
+    """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
+
+    :param task_name_list: List[Union[str, Dict, Task]]
+        Name of model or LM object, see lm_eval.models.get_model
+    :param task_manager: TaskManager = None
+        A TaskManager object that stores indexed tasks. If not set,
+        task_manager will load one. This should be set by the user
+        if there are additional paths that want to be included
+        via `include_path`
+
+    :return
+        Dictionary of task objects
+    """
+    task_name_from_string_dict = {}
+    task_name_from_config_dict = {}
+    task_name_from_object_dict = {}
+
+    if isinstance(task_name_list, str):
+        task_name_list = [task_name_list]
+
+    string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
+    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    if len(string_task_name_list) > 0:
+        if task_manager is None:
+            task_manager = TaskManager()
+
+        task_name_from_string_dict = task_manager.load_task_or_group(
+            string_task_name_list
+        )
+
+    for task_element in others_task_name_list:
+        if isinstance(task_element, dict):
+            task_name_from_config_dict = {
+                **task_name_from_config_dict,
+                **task_manager.load_config(config=task_element),
+            }
+
+        elif isinstance(task_element, Task):
+            task_name_from_object_dict = {
+                **task_name_from_object_dict,
+                get_task_name_from_object(task_element): task_element,
+            }
+
+    assert set(task_name_from_string_dict.keys()).isdisjoint(
+        set(task_name_from_object_dict.keys())
+    )
+    return {
+        **task_name_from_string_dict,
+        **task_name_from_config_dict,
+        **task_name_from_object_dict,
+    }
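The deprecation notices above point callers at the new TaskManager API. A minimal usage sketch, based only on the methods visible in this diff; the custom include path is a hypothetical example.

    from lm_eval.tasks import TaskManager, get_task_dict

    task_manager = TaskManager(verbosity="INFO", include_path="./my_custom_tasks")
    print(len(task_manager.all_tasks))            # all indexed task and group names
    print(task_manager.match_tasks(["arc_*"]))    # wildcard matching against the index

    # get_task_dict now accepts an optional TaskManager instead of relying on a global registry
    task_dict = get_task_dict(["arc_easy", "boolq"], task_manager=task_manager)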
lm_eval/tasks/bbh/_generate_configs.py (+2, -2)

@@ -28,7 +28,7 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     base_doc_to_text = "Q: {{input}}\nA:"
@@ -70,7 +70,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"/{task}.yaml"
         utils.eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
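The change here, repeated across several generator scripts in this commit, passes encoding="utf-8" explicitly so the YAML round-trip does not depend on the platform's default locale encoding (for example cp1252 on Windows). A small sketch with a hypothetical file name:

    import yaml

    with open("example_task.yaml", "w", encoding="utf-8") as f:
        yaml.dump({"doc_to_text": "Q: {{input}}\nA:"}, f, allow_unicode=True)

    with open("example_task.yaml", encoding="utf-8") as f:
        print(yaml.full_load(f))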
lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml (+1, -0)

@@ -29,3 +29,4 @@ filter_list:
 num_fewshot: 0
 metadata:
   version: 2.0
+  num_fewshot: 3 # controls what is printed in n-shot
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml (+1, -0)

@@ -20,3 +20,4 @@ generation_kwargs:
 num_fewshot: 0
 metadata:
   version: 1.0
+  num_fewshot: 3 # will be printed in results table
lm_eval/tasks/belebele/_generate_configs.py (+4, -4)

@@ -27,13 +27,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     if args.cot_prompt_path is not None:
         import json

-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)

     def query():
@@ -42,7 +42,7 @@ if __name__ == "__main__":
     print(query())
     languages = [split["split"] for split in query()]

-    for lang in tqdm(languages):
+    for lang in tqdm([lang for lang in languages if "default" not in lang]):
         yaml_dict = {
             "include": base_yaml_name,
             "task": f"belebele_{args.task_prefix}_{lang}"
@@ -54,7 +54,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{lang}.yaml"
         logging.info(f"Saving yaml for subset {lang} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
lm_eval/tasks/belebele/belebele_default.yaml (deleted, +0, -4)

-"fewshot_split": "default"
-"include": "_default_template_yaml"
-"task": "belebele_default"
-"test_split": "default"
lm_eval/tasks/benchmarks/flan/_held_in_template_yaml (new file, +14, -0)

output_type: generate_until
test_split: null
doc_to_choice: null
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.0
metadata:
  version: 1.0
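Each prompt variant in the new FLAN group files pulls these defaults in through its `include: _held_in_template_yaml` key and then overrides only what differs. A simplified stand-in for that merge is shown below (the real include resolution lives in utils.load_yaml_config; the prompt text is truncated for the sketch):

    template = {
        "output_type": "generate_until",
        "generation_kwargs": {"until": ["</s>"], "do_sample": False, "temperature": 0.0},
    }
    prompt_variant = {
        "task": "anli_r1",
        "task_alias": "prompt-0",
        "doc_to_text": "{{premise}}\n\n...",
        "doc_to_target": "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}",
    }

    merged = {**template, **prompt_variant}   # variant keys win over the included defaults
    print(merged["output_type"], merged["task_alias"])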
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml (new file, +331, -0)

group: flan_held_in
group_alias: Flan (Held-In)
task:
  # ANLI R1
  - group: anli_r1_flan
    group_alias: ANLI R1
    task:
      - task: anli_r1
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # ANLI R2
  - group: anli_r2_flan
    group_alias: ANLI R2
    task:
      - task: anli_r2
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # ANLI R3
  - group: anli_r3_flan
    group_alias: ANLI R3
    task:
      - task: anli_r3
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # Arc Easy
  - group: arc_easy_flan
    group_alias: Arc Easy
    task:
      - task: arc_easy
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # Arc Challenge
  - group: arc_challenge_flan
    group_alias: Arc Challenge
    task:
      - task: arc_challenge
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # BoolQ
  - group: boolq_flan
    group_alias: BoolQ
    task:
      - task: boolq
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-9
        include: _held_in_template_yaml
        doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
  # RTE
  - group: rte_flan
    group_alias: RTE
    task:
      - task: rte
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Question with options: can we draw the following hypothesis from the context?\n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml (new file, +13, -0)

group: flan_held_out
task:
  # BBH
  - bbh_zeroshot
  - bbh_fewshot
  - bbh_cot_fewshot
  - bbh_cot_zeroshot
  # MMLU
  - mmlu
  - mmlu_flan_n_shot_generative
  - mmlu_flan_n_shot_loglikelihood
  - mmlu_flan_cot_zeroshot
  - mmlu_flan_cot_fewshot
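Both new group names can be requested like any other task. A sketch, assuming the 0.4.x Python entry point; the model choice is a placeholder for illustration, not something set by this commit:

    import lm_eval

    results = lm_eval.simple_evaluate(
        model="hf",
        model_args="pretrained=EleutherAI/pythia-70m",
        tasks=["flan_held_in"],   # group defined in flan_held_in.yaml above
        num_fewshot=0,
    )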
lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml (+0, -6)

@@ -5,19 +5,13 @@ task:
   - medqa_4options
   - task: mmlu_anatomy
     task_alias: "anatomy (mmlu)"
-    group_alias: null
   - task: mmlu_clinical_knowledge
     task_alias: "clinical_knowledge (mmlu)"
-    group_alias: null
   - task: mmlu_college_medicine
     task_alias: "college_medicine (mmlu)"
-    group_alias: null
   - task: mmlu_medical_genetics
     task_alias: "medical_genetics (mmlu)"
-    group_alias: null
   - task: mmlu_professional_medicine
     task_alias: "professional_medicine (mmlu)"
-    group_alias: null
   - task: mmlu_college_biology
     task_alias: "college_biology (mmlu)"
-    group_alias: null
lm_eval/tasks/bigbench/generate_tasks.py (+1, -1)

@@ -181,7 +181,7 @@ def main() -> None:
     for task in all_subtasks:
         file_name = f"{task}.yaml"
         try:
-            with open(f"{path}/{file_name}", "w") as f:
+            with open(f"{path}/{file_name}", "w", encoding="utf-8") as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
lm_eval/tasks/blimp/generate_configs.py (+1, -1)

@@ -75,7 +75,7 @@ def main() -> None:
     for task in all_subtasks:
         file_name = f"{task}.yaml"
         try:
-            with open(f"{file_name}", "w") as f:
+            with open(f"{file_name}", "w", encoding="utf-8") as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
lm_eval/tasks/ceval/_generate_configs.py (+3, -3)

@@ -79,13 +79,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     if args.cot_prompt_path is not None:
         import json

-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)

     for subject_eng, subject_zh in tqdm(SUBJECTS.items()):
@@ -107,7 +107,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
         eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
lm_eval/tasks/cmmlu/_generate_configs.py (+3, -3)

@@ -94,13 +94,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     if args.cot_prompt_path is not None:
         import json

-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)

     for subject_eng, subject_zh in tqdm(SUBJECTS.items()):
@@ -122,7 +122,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
         eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
lm_eval/tasks/code_x_glue/code-text/bleu.py (+1, -1)

@@ -184,7 +184,7 @@ def splitPuncts(line):
 def computeMaps(predictions, goldfile):
     predictionMap: Dict[str, list] = {}
     goldMap: Dict[str, list] = {}
-    gf = open(goldfile, "r")
+    gf = open(goldfile, "r", encoding="utf-8")

     for row in predictions:
         cols = row.strip().split("\t")
lm_eval/tasks/csatqa/_generate_configs.py (+2, -2)

@@ -25,7 +25,7 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     for name in tqdm(SUBSETS):
@@ -39,7 +39,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{name.lower()}.yaml"
         eval_logger.info(f"Saving yaml for subset {name} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,