Commit bc4b922c authored by Baber

Merge branch 'main' into llama

# Conflicts:
#	lm_eval/tasks/llama3/README.md
parents 748eb47e b2c090cc
@@ -22,10 +22,10 @@ jobs:
     steps:
       - name: Checkout Code
         uses: actions/checkout@v4
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v5
         with:
-          python-version: 3.8
+          python-version: 3.9
           cache: pip
           cache-dependency-path: pyproject.toml
       - name: Pre-Commit
@@ -42,7 +42,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: ["3.9", "3.10", "3.11", "3.12" ]
     timeout-minutes: 30
     steps:
       - name: Checkout Code
@@ -75,15 +75,16 @@ jobs:
     steps:
       - name: Checkout Code
         uses: actions/checkout@v4
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v5
         with:
-          python-version: 3.8
+          python-version: 3.9
           cache: pip
           cache-dependency-path: pyproject.toml
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -U transformers peft
       - name: Test with pytest
         run: python -m pytest tests/models --showlocals -s -vv
@@ -29,7 +29,7 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.4
+    rev: v0.9.2
     hooks:
       # Run the linter.
       - id: ruff
......
@@ -270,6 +270,7 @@ Note that for externally hosted models, configs such as `--device` which relate
 | vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
 | Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
 | Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
+| Huggingface Optimum-intel IPEX (Causal LMs) | ✔️ | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
 | Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
 | [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... |
 | [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
@@ -492,6 +493,7 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
 | hf_transfer | For speeding up HF Hub file downloads |
 | ifeval | For running the IFEval task |
 | ibm_watsonx_ai | For using IBM watsonx.ai model apis |
+| ipex | For running on optimum-intel ipex backend |
 | neuronx | For running on AWS inf2 instances |
 | mamba | For loading Mamba SSM models |
 | math | For running math task answer checking |
......
@@ -58,7 +58,7 @@ This mode supports a number of command-line arguments, the details of which can
 * `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42.
-* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., `--wandb_args project=test-project,name=test-run`
+* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., `--wandb_args project=test-project,name=test-run`. Also allows for the passing of the step to log things at (passed to `wandb.run.log`), e.g., `--wandb_args step=123`.
 * `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
     * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
......
@@ -190,7 +190,8 @@ doc_to_target: "{{answer}}"
 ```
-**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_target(doc) + target_delimiter + doc_to_text(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively.
+> [!WARNING]
+> We add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively. For multiple choice the target will be each choice index concatenated with the delimiter.
 #### Multiple choice format

@@ -206,7 +207,7 @@ doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
 ```
 Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use.

-The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index if defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in a the form of a list `["no", "yes"]` that will correspond to the label index.
+The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index is defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in the form of a list `["no", "yes"]` that will correspond to the label index.

 ```yaml
 doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
......
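To make the concatenation described in the warning above concrete, here is a minimal sketch (the question and answer strings are made up):

```python
# Minimal sketch of the input-output assembly described in the warning above.
doc_to_text = "Question: What is 2+2?\nAnswer:"  # no trailing whitespace
target_delimiter = " "                           # the default
doc_to_target = "4"                              # no leading whitespace

full_example = doc_to_text + target_delimiter + doc_to_target
assert full_example == "Question: What is 2+2?\nAnswer: 4"
```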
@@ -37,6 +37,7 @@ Prompting / in-context formatting options:
 - **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
 - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
+- **assistant_prefill** (`str`, *optional*) — String to append after the <|assistant|> token. For example, the assistant_prefill could be "The answer is: " to prompt the model to answer the question. If not using a chat template then this string will be appended to the end of the prompt.

 Runtime configuration options:
 - **num_fewshot** (`int`, *optional*, defaults to 0) — Number of few-shot examples before the input.
......
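For intuition, a rough sketch of the chat history an `assistant_prefill` produces (message contents are invented):

```python
# Invented contents: with a prefill, the final assistant turn is pre-seeded,
# and no fresh generation prompt is appended after it.
chat_history = [
    {"role": "user", "content": "Question: Which gas do plants absorb?\nAnswer:"},
    {"role": "assistant", "content": "The answer is: "},  # assistant_prefill
]
# The model continues generating from "The answer is: ".
```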
@@ -257,6 +257,11 @@ def setup_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
     )
+    parser.add_argument(
+        "--confirm_run_unsafe_code",
+        action="store_true",
+        help="Confirm that you understand the risks of running unsafe code for tasks that require it",
+    )
     return parser
@@ -404,6 +409,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         numpy_random_seed=args.seed[1],
         torch_random_seed=args.seed[2],
         fewshot_random_seed=args.seed[3],
+        confirm_run_unsafe_code=args.confirm_run_unsafe_code,
         **request_caching_args,
     )
......
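The new flag threads through to the Python API as well; a hedged sketch of the equivalent call (checkpoint and task name are placeholders):

```python
import lm_eval

# Placeholder model/task names: a task whose YAML sets `unsafe_code: true`
# raises unless confirm_run_unsafe_code=True is passed.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-70m",
    tasks=["my_unsafe_task"],  # hypothetical task name
    confirm_run_unsafe_code=True,
)
```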
@@ -112,6 +112,4 @@ class ConfigurableGroup(abc.ABC):
         return self._config.group

     def __repr__(self):
-        return (
-            f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})"
-        )
+        return f"ConfigurableGroup(group={self.group},group_alias={self.group_alias})"
@@ -527,9 +527,9 @@ def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
 def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
-    assert (
-        metrics is not None
-    ), "Need to pass a list of each subtask's metric for this stderr aggregation"
+    assert metrics is not None, (
+        "Need to pass a list of each subtask's metric for this stderr aggregation"
+    )
     assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)

     # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
......
@@ -113,13 +113,17 @@ class LM(abc.ABC):
         """
         pass

-    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+    def apply_chat_template(
+        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+    ) -> str:
         """
         Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.

         :param chat_history: list[dict[str, str]]
             A list of dictionaries with keys 'role' and 'content'.
             Values are strings representing the role name and the content of the message, respectively.
+        :param add_generation_prompt: bool
+            Whether to append an assistant generation prefix (e.g. <|assistant|>) to the end of the chat history. False if prefilling an assistant message.
         :return: str
             A string representing the chat history in a format that can be used as input to the LM.
         """
......
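For reference, this mirrors the Hugging Face tokenizer behavior; a sketch (the model name is only an example):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example model
chat = [{"role": "user", "content": "Hi!"}]

# True: the template ends with the assistant header, cueing a fresh assistant turn.
prompted = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# False: used when the last turn is a prefilled assistant message.
unprompted = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
```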
@@ -17,13 +17,13 @@ def register_model(*names):
     def decorate(cls):
         for name in names:
-            assert issubclass(
-                cls, LM
-            ), f"Model '{name}' ({cls.__name__}) must extend LM class"
+            assert issubclass(cls, LM), (
+                f"Model '{name}' ({cls.__name__}) must extend LM class"
+            )

-            assert (
-                name not in MODEL_REGISTRY
-            ), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
+            assert name not in MODEL_REGISTRY, (
+                f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
+            )

             MODEL_REGISTRY[name] = cls
         return cls
@@ -48,9 +48,9 @@ func2task_index = {}
 def register_task(name):
     def decorate(fn):
-        assert (
-            name not in TASK_REGISTRY
-        ), f"task named '{name}' conflicts with existing registered task!"
+        assert name not in TASK_REGISTRY, (
+            f"task named '{name}' conflicts with existing registered task!"
+        )

         TASK_REGISTRY[name] = fn
         ALL_TASKS.add(name)
@@ -104,9 +104,9 @@ def register_metric(**args):
         ]:
             if key in args:
                 value = args[key]
-                assert (
-                    value not in registry
-                ), f"{key} named '{value}' conflicts with existing registered {key}!"
+                assert value not in registry, (
+                    f"{key} named '{value}' conflicts with existing registered {key}!"
+                )

                 if key == "metric":
                     registry[name] = fn
@@ -140,9 +140,9 @@ def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
 def register_aggregation(name: str):
     def decorate(fn):
-        assert (
-            name not in AGGREGATION_REGISTRY
-        ), f"aggregation named '{name}' conflicts with existing registered aggregation!"
+        assert name not in AGGREGATION_REGISTRY, (
+            f"aggregation named '{name}' conflicts with existing registered aggregation!"
+        )

         AGGREGATION_REGISTRY[name] = fn
         return fn
......
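For context, a minimal sketch of how these decorators are used when registering a backend (the class and alias are hypothetical):

```python
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model


@register_model("my-backend")  # hypothetical alias
class MyBackend(LM):
    # The three request types every LM subclass must implement.
    def loglikelihood(self, requests):
        ...

    def loglikelihood_rolling(self, requests):
        ...

    def generate_until(self, requests):
        ...
```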
 from functools import partial
+from typing import TYPE_CHECKING, Iterable, Optional, Union

 import datasets

+if TYPE_CHECKING:
+    from random import Random
+
+    from lm_eval.api.task import ConfigurableTask, Task
+

 class ContextSampler:
-    def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
+    def __init__(
+        self,
+        docs: list[dict],
+        task: Union["Task", "ConfigurableTask"],
+        fewshot_indices: Optional[Iterable] = None,
+        rnd: Optional["Random"] = None,
+    ) -> None:
         self.rnd = rnd
         if not self.rnd:
             raise ValueError(
@@ -58,8 +71,9 @@ class ContextSampler:
             )
             self.docs = self.docs.select(fewshot_indices)

-    def get_context(self, doc, num_fewshot):
+    def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str = None):
         # draw an extra fewshot sample if using same split as evaluating on
+        prefix = gen_prefix + " " if gen_prefix else ""
         n_samples = (
             num_fewshot + 1
             if self.config.fewshot_split == self.config.test_split
@@ -77,14 +91,14 @@ class ContextSampler:
         for doc in selected_docs:
             doc_content = self.doc_to_text(doc)
             doc_target = self.doc_to_target(doc)
-            labeled_examples += (
-                doc_content
-                if self.config.doc_to_choice is None or isinstance(doc_content, str)
-                else self.doc_to_choice(doc)[doc_content]
-            )
+            if self.config.doc_to_choice is None or isinstance(doc_content, str):
+                labeled_examples += doc_content
+            else:
+                labeled_examples += self.doc_to_choice(doc)[doc_content]
             if doc_target != "":
                 labeled_examples += self.target_delimiter
+                labeled_examples += prefix
                 labeled_examples += (
                     str(doc_target[0])
                     if isinstance(doc_target, list)
@@ -98,10 +112,13 @@ class ContextSampler:
     def get_chat_context(
         self,
-        doc,
-        num_fewshot,
+        doc: dict,
+        num_fewshot: int,
         fewshot_as_multiturn: bool = False,
+        gen_prefix: Optional[str] = None,
     ):
+        # TODO: Do we need any other delimiter
+        prefix = gen_prefix + " " if gen_prefix else ""
         chat_history = []
         # draw an extra fewshot sample if using same split as evaluating on
         n_samples = (
@@ -132,23 +149,28 @@ class ContextSampler:
                 chat_history.append(
                     {
                         "role": "assistant",
-                        "content": str(doc_target[0])
+                        "content": prefix + str(doc_target[0])
                         if isinstance(doc_target, list)
-                        else doc_target
+                        else prefix + doc_target
                         if self.config.doc_to_choice is None
                         or isinstance(doc_target, str)
-                        else str(self.doc_to_choice(doc)[doc_target]),
+                        else prefix + str(self.doc_to_choice(doc)[doc_target]),
                     }
                 )
         else:
             # get fewshot context as one user turn
             chat_history.append(
-                {"role": "user", "content": self.get_context(doc, num_fewshot)}
+                {
+                    "role": "user",
+                    "content": self.get_context(
+                        doc, num_fewshot, gen_prefix=gen_prefix
+                    ),
+                }
             )

         return chat_history

-    def sample(self, n):
+    def sample(self, n: int):
         """
         Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
         """
@@ -157,19 +179,19 @@ class ContextSampler:

 class FirstNSampler(ContextSampler):
-    def sample(self, n) -> None:
+    def sample(self, n: int) -> None:
         """
         Draw the first `n` samples in order from the specified split.
         Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
         """
-        assert (
-            n <= len(self.docs)
-        ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
+        assert n <= len(self.docs), (
+            f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
+        )
         return self.docs[:n]


 class BalancedSampler(ContextSampler):
-    def sample(self, n) -> None:
+    def sample(self, n: int) -> None:
         """
         TODO: this should return approximately class-balanced samples from our fewshot examples.
         TODO: what order should they be in? maybe random?
@@ -179,7 +201,7 @@ class BalancedSampler(ContextSampler):

 class ManualSampler(ContextSampler):
-    def sample(self, n) -> None:
+    def sample(self, n: int) -> None:
         """ """
         pass
@@ -190,7 +212,7 @@ SAMPLER_REGISTRY = {
 }


-def get_sampler(name):
+def get_sampler(name: str):
     try:
         return SAMPLER_REGISTRY[name]
     except KeyError:
......
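Putting the sampler changes together, a hedged sketch of the per-example string `get_context` now builds (field values are invented):

```python
# Invented values illustrating get_context's concatenation with a gen_prefix.
target_delimiter = " "
gen_prefix = "Answer:"                  # from doc_to_prefix(doc); may be None
prefix = gen_prefix + " " if gen_prefix else ""

doc_text = "Q: What color is the sky?"  # doc_to_text(doc)
doc_target = "blue"                     # doc_to_target(doc)

labeled_example = doc_text + target_delimiter + prefix + doc_target
assert labeled_example == "Q: What color is the sky? Answer: blue"
```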
@@ -75,6 +75,7 @@ class TaskConfig(dict):
     doc_to_text: Optional[Union[Callable, str]] = None
     doc_to_target: Optional[Union[Callable, str]] = None
     doc_to_image: Union[Callable, str] = None
+    unsafe_code: bool = False
     doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
     process_results: Optional[Union[Callable, str]] = None
     use_prompt: Optional[str] = None
@@ -92,6 +93,7 @@ class TaskConfig(dict):
     filter_list: Optional[Union[str, list]] = None
     should_decontaminate: bool = False
     doc_to_decontamination_query: Optional[str] = None
+    gen_prefix: Optional[str] = None
     metadata: Optional[dict] = (
         None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
     )
@@ -369,6 +371,9 @@ class Task(abc.ABC):
     def doc_to_image(self, doc):
         raise NotImplementedError

+    def doc_to_prefix(self, doc):
+        return ""
+
     def build_all_requests(
         self,
         *,
@@ -398,7 +403,7 @@ class Task(abc.ABC):
             )
             cache_key += f"-tokenizer{tokenizer_name}"

-        cached_instances = load_from_cache(file_name=cache_key)
+        cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)

         if cache_requests and cached_instances and not rewrite_requests_cache:
             cached_instances = cached_instances[:limit]
@@ -442,6 +447,7 @@ class Task(abc.ABC):
                 apply_chat_template,
                 fewshot_as_multiturn,
                 chat_template,
+                gen_prefix=self.doc_to_prefix(doc),
             )

             # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
@@ -541,13 +547,7 @@ class Task(abc.ABC):
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(
-        self,
-        doc,
-        num_fewshot,
-        rnd=None,
-        description=None,
-    ):
+    def fewshot_context(self, doc, num_fewshot, rnd=None, description=None, **kwargs):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -732,6 +732,9 @@ class ConfigurableTask(Task):
             # mark the task as requiring multimodality.
             self.MULTIMODAL = True

+        if self.config.unsafe_code is not False:
+            self.UNSAFE_CODE = True
+
         if self.config.dataset_path is not None:
             self.DATASET_PATH = self.config.dataset_path
@@ -1000,6 +1003,7 @@ class ConfigurableTask(Task):
         labeled_examples: List[Dict[str, str]],
         question: str,
         fewshot_as_multiturn: bool = False,
+        gen_prefix: Optional[str] = None,
     ) -> None:
         """Adds a target question to the labeled examples list.
         If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
@@ -1015,17 +1019,20 @@ class ConfigurableTask(Task):
         else:
             # if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
             labeled_examples.append({"role": "user", "content": question})
+        if gen_prefix:
+            labeled_examples.append({"role": "assistant", "content": gen_prefix})

     @utils.positional_deprecated
     def fewshot_context(
         self,
-        doc: str,
+        doc: dict,
         num_fewshot: int,
         system_instruction: Optional[str] = None,
         apply_chat_template: bool = False,
         fewshot_as_multiturn: bool = False,
         chat_template: Optional[Callable] = None,
-    ) -> str:
+        gen_prefix: Optional[str] = None,
+    ) -> Union[str, List[str]]:
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -1044,7 +1051,6 @@ class ConfigurableTask(Task):
         :returns: str
             The fewshot context.
         """
-
         if apply_chat_template:
             labeled_examples = []
         else:
@@ -1072,25 +1078,33 @@ class ConfigurableTask(Task):
                 labeled_examples.append({"role": "system", "content": system_prompt})
             else:
                 labeled_examples = system_prompt

         # if few-shot - append examples after the system prompt
         if num_fewshot > 0:
             if apply_chat_template:
                 labeled_examples.extend(
                     self.sampler.get_chat_context(
-                        doc, num_fewshot, fewshot_as_multiturn
+                        doc,
+                        num_fewshot,
+                        fewshot_as_multiturn,
+                        gen_prefix=gen_prefix,
                     )
                 )
             else:
-                labeled_examples += self.sampler.get_context(doc, num_fewshot)
+                labeled_examples += self.sampler.get_context(
+                    doc, num_fewshot, gen_prefix=gen_prefix
+                )

         example = self.doc_to_text(doc)
         if apply_chat_template:
             if self.multiple_input:
+                # TODO: append prefill?
                 return chat_template(labeled_examples)
             if isinstance(example, str):
                 self.append_target_question(
-                    labeled_examples, example, fewshot_as_multiturn
+                    labeled_examples,
+                    example,
+                    fewshot_as_multiturn,
+                    gen_prefix=gen_prefix,
                 )
             # for loglikelihood create a list of questions with appended choices
             elif isinstance(example, list):
@@ -1098,37 +1112,62 @@ class ConfigurableTask(Task):
                 # copy chat history for each example and append the answer
                 for ex in example:
                     chat = deepcopy(labeled_examples)
-                    self.append_target_question(chat, ex, fewshot_as_multiturn)
-                    labeled_examples_list.append(chat_template(chat))
+                    self.append_target_question(
+                        chat,
+                        ex,
+                        fewshot_as_multiturn,
+                        gen_prefix=gen_prefix,
+                    )
+                    # TODO: append prefill?
+                    labeled_examples_list.append(
+                        chat_template(
+                            chat,
+                            add_generation_prompt=False if gen_prefix else True,
+                        )
+                    )
                 return labeled_examples_list
             # if example is an integer, append the choice or convert to string
             elif isinstance(example, int):
                 if self.config.doc_to_choice is not None:
                     choices = self.doc_to_choice(doc)
                     self.append_target_question(
-                        labeled_examples, choices[example], fewshot_as_multiturn
+                        labeled_examples,
+                        choices[example],
+                        fewshot_as_multiturn,
+                        gen_prefix=gen_prefix,
                     )
                 else:
                     self.append_target_question(
-                        labeled_examples, str(example), fewshot_as_multiturn
+                        labeled_examples,
+                        str(example),
+                        fewshot_as_multiturn,
+                        gen_prefix=gen_prefix,
                     )
             # return lm.apply_chat_template(labeled_examples)
-            return chat_template(labeled_examples)
+            return chat_template(
+                labeled_examples,
+                add_generation_prompt=False if gen_prefix else True,
+            )
         else:
+            prefix = (
+                self.config.target_delimiter + gen_prefix
+                if gen_prefix is not None
+                else ""
+            )
             if self.multiple_input:
                 return labeled_examples
             if isinstance(example, str):
-                return labeled_examples + example
+                return labeled_examples + example + prefix
             elif isinstance(example, list):
-                return [labeled_examples + ex for ex in example]
+                return [labeled_examples + ex + prefix for ex in example]
             elif isinstance(example, int):
                 if self.config.doc_to_choice is not None:
                     choices = self.doc_to_choice(doc)
-                    return labeled_examples + choices[example]
+                    return labeled_examples + choices[example] + prefix
                 else:
-                    return labeled_examples + str(example)
+                    return labeled_examples + str(example) + prefix

-    def apply_filters(self):
+    def apply_filters(self) -> Optional[List[Instance]]:
         """Iterates over FilterEnsembles and applies them to instances"""
         if hasattr(self, "_filters"):
             for f in self._filters:
@@ -1140,7 +1179,7 @@ class ConfigurableTask(Task):
     def should_decontaminate(self):
         return self.config.should_decontaminate

-    def doc_to_decontamination_query(self, doc):
+    def doc_to_decontamination_query(self, doc: dict):
         if self.config.should_decontaminate:
             if self.config.doc_to_decontamination_query is None:
                 return self.doc_to_text(doc)
@@ -1299,6 +1338,14 @@ class ConfigurableTask(Task):
         else:
             return None

+    def doc_to_prefix(self, doc):
+        if (gen_prefix := self.config.gen_prefix) is not None:
+            if gen_prefix in self.features:
+                return doc[gen_prefix]
+            else:
+                return utils.apply_template(gen_prefix, doc)
+        return None
+
     def construct_requests(
         self, doc: dict, ctx: str, **kwargs
     ) -> Union[List[Instance], Instance]:
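Reading the new `doc_to_prefix`: a `gen_prefix` naming a dataset feature is taken verbatim from the document; anything else is rendered as a template over the document. A sketch with made-up fields:

```python
from jinja2 import Template

# Made-up doc illustrating the two branches of doc_to_prefix.
doc = {"question": "2+2?", "prefix_col": "The answer is"}

prefix_a = doc["prefix_col"]  # gen_prefix = "prefix_col" names a feature

# A gen_prefix that is not a feature name is treated as a template over the doc:
prefix_b = Template("Regarding {{question}},").render(**doc)  # "Regarding 2+2?,"
```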
@@ -1503,9 +1550,9 @@ class ConfigurableTask(Task):
         # we expect multiple_targets to be a list.
         elif self.multiple_target:
             gold = list(gold)
-        elif (
-            type(gold) is not type(result)
-            and "bypass" not in self._metric_fn_list.keys()
+        # TODO: handle this better
+        elif type(gold) is not type(result) and not (
+            "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
         ):
             # cast gold to the same type as result
             gold = type(result)(gold)
@@ -1561,7 +1608,10 @@ class ConfigurableTask(Task):
                 result_score = self._metric_fn_list[metric]([gold, result])
                 if isinstance(result_score, dict):
                     # TODO: this handles the case where HF evaluate returns a dict.
-                    result_score = result_score[metric]
+                    # This allows for multiple metrics to be returned from the same function
+                    for k, v in result_score.items():
+                        result_dict[k] = v
+                    return result_dict
                 result_dict[metric] = result_score
             else:
                 raise ValueError(
......
@@ -21,7 +21,9 @@ HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()
 FILE_SUFFIX = f".{HASH_PREFIX}.pickle"


-def load_from_cache(file_name):
+def load_from_cache(file_name: str, cache: bool = False):
+    if not cache:
+        return
     try:
         path = f"{PATH}/{file_name}{FILE_SUFFIX}"
......
@@ -110,12 +110,15 @@ class TextReader:
     def read_tqdm(self, update_frequency: int = 10000):
         current_file_position = 0
         line_counter = 0
-        with open(self.file_path, "r", encoding="utf-8") as fh, tqdm.tqdm(
-            total=os.path.getsize(self.file_path),
-            dynamic_ncols=True,
-            unit="byte",
-            unit_scale=1,
-        ) as progress:
+        with (
+            open(self.file_path, "r", encoding="utf-8") as fh,
+            tqdm.tqdm(
+                total=os.path.getsize(self.file_path),
+                dynamic_ncols=True,
+                unit="byte",
+                unit_scale=1,
+            ) as progress,
+        ):
             with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                 for line in iter(mmap_obj.readline, b""):
                     line = line.decode("utf-8")
......
@@ -151,7 +151,7 @@ def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> d
     elapsed = time.perf_counter() - start
     print(f"Read took {elapsed:0.5f} seconds.")
-    print(f"Speed: {(os.path.getsize(file)/1000000.0)/elapsed}MB/second")
+    print(f"Speed: {(os.path.getsize(file) / 1000000.0) / elapsed}MB/second")

     print(duplicates)
......
@@ -74,6 +74,7 @@ def simple_evaluate(
     numpy_random_seed: int = 1234,
     torch_random_seed: int = 1234,
     fewshot_random_seed: int = 1234,
+    confirm_run_unsafe_code: bool = False,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -313,6 +314,7 @@ def simple_evaluate(
         apply_chat_template=apply_chat_template,
         fewshot_as_multiturn=fewshot_as_multiturn,
         verbosity=verbosity,
+        confirm_run_unsafe_code=confirm_run_unsafe_code,
     )

     if lm.rank == 0:
@@ -372,6 +374,7 @@ def evaluate(
     apply_chat_template: Union[bool, str] = False,
     fewshot_as_multiturn: bool = False,
     verbosity: str = "INFO",
+    confirm_run_unsafe_code: bool = False,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -381,6 +384,10 @@ def evaluate(
         Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
     :param limit: int, optional
         Limit the number of examples per task (only use this for testing)
+    :param cache_requests: bool, optional
+        Speed up evaluation by caching the building of dataset requests.
+    :param rewrite_requests_cache: bool, optional
+        Rewrites all the request cache if set to `True`.
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
     :param write_out: bool
@@ -396,6 +403,10 @@ def evaluate(
         Defaults to False (no chat template applied).
     :param fewshot_as_multiturn: bool
         Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
+    :param verbosity: str
+        Verbosity level for logging
+    :param confirm_run_unsafe_code: bool
+        Whether to confirm running tasks marked as unsafe.
     :return
         Dictionary of results
     """
@@ -422,13 +433,19 @@ def evaluate(
     ):
         raise ValueError("log_samples must be True for 'bypass' metric-only tasks")

-    # validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
+    # validation checks:
+    # 1. are we running a multimodal task <-> non-multimodal model class, or vice-versa.
+    # 2. are we running code that is marked as unsafe.
     incompatible_tasks = []
     for task_output in eval_tasks:
         task: Task = task_output.task

         if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
             incompatible_tasks.append(task_output.task_name)
+        elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code:
+            raise ValueError(
+                f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
+            )
     if len(incompatible_tasks) > 0:
         if not getattr(lm, "MULTIMODAL", False):
             raise ValueError(
@@ -438,7 +455,7 @@ def evaluate(
             raise ValueError(
                 f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
             )
-    # end multimodality validation check
+    # end validation check

     # Cache the limit arg.
     limit_arg = limit
......
@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, Union
 from lm_eval.api.group import ConfigurableGroup
 from lm_eval.api.metrics import (
     aggregate_subtask_metrics,
+    mean,
     pooled_sample_stderr,
     stderr_for_metric,
 )
@@ -99,7 +100,12 @@ class TaskOutput:
     def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
         for (metric, filter_key), items in self.sample_metrics.items():
-            agg_fn = self.task.aggregation()[metric]
+            try:
+                agg_fn = self.task.aggregation()[metric]
+            except KeyError:
+                # This is when process results output an arbitrary metric
+                # TODO: Handle this better and allow other aggregate functions other than mean.
+                agg_fn = mean
             metric_key = f"{metric},{filter_key}"
             self.agg_metrics[metric_key] = agg_fn(items)
             self.sample_len = len(items)  # TODO: same sample size for each metric?
......
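As a hedged example of what this fallback enables: a `process_results` that emits a metric not declared in the task config, whose unknown key is then aggregated with `mean`:

```python
# Sketch: process_results emitting an extra, undeclared metric per sample.
def process_results(doc, results):
    pred = results[0].strip()
    return {
        "exact_match": float(pred == doc["answer"]),
        "pred_chars": float(len(pred)),  # undeclared -> aggregated with mean
    }
```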
@@ -4,7 +4,7 @@ from typing import List
 from lm_eval.api.filter import FilterEnsemble
 from lm_eval.api.registry import get_filter

-from . import extraction, selection, transformation
+from . import custom, extraction, selection, transformation


 def build_filter_ensemble(
......
+from lm_eval.api.filter import Filter
+from lm_eval.api.registry import register_filter
+
+
+@register_filter("custom")
+class CustomFilter(Filter):
+    """
+    Custom filter that applies a custom, user-defined function to the model responses.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        self.filter_fn = kwargs.pop("filter_fn")
+
+        super().__init__(**kwargs)
+
+    def apply(self, resps, docs):
+        return self.filter_fn(resps, docs)
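A hedged usage sketch of the new filter (the lowercasing function is made up):

```python
from lm_eval.filters.custom import CustomFilter


def lowercase_responses(resps, docs):
    # resps: per-document lists of model responses; docs: matching documents
    return [[r.lower() for r in doc_resps] for doc_resps in resps]


fltr = CustomFilter(filter_fn=lowercase_responses)
print(fltr.apply([["YES", "No"]], [{"question": "example"}]))  # [['yes', 'no']]
```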