Commit 89b6bdb3 authored by Baber's avatar Baber

Merge branch 'main' into ai2d

parents 59053d58 144a1e58
......@@ -22,10 +22,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: 3.9
cache: pip
cache-dependency-path: pyproject.toml
- name: Pre-Commit
......@@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: ["3.9", "3.10", "3.11", "3.12" ]
timeout-minutes: 30
steps:
- name: Checkout Code
......@@ -75,15 +75,16 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: 3.9
cache: pip
cache-dependency-path: pyproject.toml
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
pip install -U transformers peft
- name: Test with pytest
run: python -m pytest tests/models --showlocals -s -vv
......@@ -29,7 +29,7 @@ repos:
- id: mixed-line-ending
args: [--fix=lf]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.4
rev: v0.9.3
hooks:
# Run the linter.
- id: ruff
......@@ -38,7 +38,7 @@ repos:
# Run the formatter.
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.1
hooks:
- id: codespell
exclude: >
......
......@@ -205,6 +205,19 @@ Note that it is recommended to substitute the `python` command by `torchrun --np
Not supported yet: multi-node evaluation and combinations of data replication with tensor or pipeline parallelism.
#### Multi-GPU evaluation with OpenVINO models
Pipeline parallelism during evaluation is supported with OpenVINO models.
To enable pipeline parallelism, set `pipeline_parallel=True` in `model_args`. In addition, set `device` to `HETERO:<GPU index1>,<GPU index2>`, for example `HETERO:GPU.1,GPU.0`. For example, the command to use pipeline parallelism across 2 GPUs is:
```
lm_eval --model openvino \
--tasks wikitext \
--model_args pretrained=<path_to_ov_model>,pipeline_parallel=True \
--device HETERO:GPU.1,GPU.0
```
### Tensor + Data Parallel and Optimized Inference with `vLLM`
We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example:
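As a rough, hedged sketch of the same idea through the Python API (the checkpoint name below is a placeholder, and the `model_args` keys `tensor_parallel_size`, `data_parallel_size`, `gpu_memory_utilization`, and `dtype` are assumed to match the vLLM backend's accepted arguments):
```python
import lm_eval

# Hedged sketch: vLLM with tensor + data parallelism via the Python API.
# The pretrained path is a placeholder; the model_args keys are assumed to
# match the vLLM backend's accepted arguments.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=meta-llama/Llama-2-7b-hf,"   # placeholder checkpoint
        "tensor_parallel_size=2,"                # shard each replica across 2 GPUs
        "data_parallel_size=2,"                  # run 2 replicas in parallel
        "gpu_memory_utilization=0.8,"
        "dtype=auto"
    ),
    tasks=["lambada_openai"],
)
```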
......@@ -257,11 +270,12 @@ Note that for externally hosted models, configs such as `--device` which relate
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Huggingface Optimum-intel IPEX (Causal LMs) | ✔️ | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... |
| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Watsonx.ai | :heavy_check_mark: | `watsonx_llm` | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx) | `generate_until`, `loglikelihood` |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | ... |
| [Your local inference server!](docs/API_guide.md) | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | ... |
Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
......@@ -338,11 +352,12 @@ lm_eval --model hf \
We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.
## Saving Results
## Saving & Caching Results
To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.
Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
> [!TIP]
> Use `--use_cache <DIR>` to cache evaluation results and skip previously evaluated samples when resuming runs of the same (model, task) pairs. Note that caching is rank-dependent, so restart with the same GPU count if interrupted. You can also use `--cache_requests` to save dataset preprocessing steps for faster evaluation resumption.
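For programmatic runs, a minimal sketch of the same caching behavior through the Python API follows, assuming `simple_evaluate` accepts `use_cache` and `cache_requests` keyword arguments (the model and cache directory below are placeholders):
```python
import lm_eval

# Hedged sketch: resuming runs with cached results and cached request building.
# The model and cache directory are placeholders.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["lambada_openai"],
    use_cache="./lm_cache/run1",   # mirrors --use_cache <DIR>; skips already-evaluated samples
    cache_requests=True,           # mirrors --cache_requests; caches dataset preprocessing
)
```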
To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). For instance:
......@@ -478,7 +493,7 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| ibm_watsonx_ai | For using IBM watsonx.ai model apis |
| ipex | For running on optimum-intel ipex backend |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
......
......@@ -50,6 +50,10 @@ When initializing a `TemplateAPI` instance or a subclass, you can provide severa
- Useful for APIs that support parallel processing.
- Default is 1 (sequential processing).
- `timeout` (int, optional):
- Timeout for API requests in seconds.
- Default is 30.
- `tokenized_requests` (bool):
- Determines whether the input is pre-tokenized. Defaults to `True`.
- Requests can be sent in either tokenized form (`list[list[int]]`) or as text (`list[str]`, or `str` for batch_size=1).
......
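As a hedged illustration of these options in practice, they can be forwarded through `model_args` when pointing at an OpenAI-compatible server; the server URL and model name below are placeholders, and the key names are assumed to map directly onto the `TemplateAPI` arguments above:
```python
import lm_eval

# Hedged sketch: forwarding TemplateAPI options to an OpenAI-compatible server.
# base_url and the model name are placeholders for a locally hosted endpoint.
results = lm_eval.simple_evaluate(
    model="local-completions",
    model_args=(
        "model=my-local-model,"                           # placeholder model name
        "base_url=http://localhost:8000/v1/completions,"
        "num_concurrent=4,"                               # parallel requests
        "timeout=60,"                                     # per-request timeout in seconds
        "tokenized_requests=False"                        # send text instead of token IDs
    ),
    tasks=["lambada_openai"],
)
```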
......@@ -58,7 +58,7 @@ This mode supports a number of command-line arguments, the details of which can
* `--seed`: Set seed for Python's random, numpy, and torch. Accepts a comma-separated list of 3 values for Python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g., `--seed 42` sets all three seeds to 42.
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```. Also allows for the passing of the step to log things at (passed to `wandb.run.log`), e.g., `--wandb_args step=123`.
* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
......
......@@ -190,7 +190,8 @@ doc_to_target: "{{answer}}"
```
**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_target(doc) + target_delimiter + doc_to_text(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively.
> [!WARNING]
> We add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively. For multiple choice the target will be each choice index concatenated with the delimiter.
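A small, illustrative sketch of how these pieces combine (the document fields are invented for the example; the real concatenation happens inside the harness):
```python
# Illustrative only: how doc_to_text, target_delimiter, and doc_to_target combine.
doc = {"question": "What color is the sky?", "answer": "blue"}  # invented example doc

rendered_input = f"Question: {doc['question']}\nAnswer:"  # doc_to_text(doc), no trailing whitespace
target_delimiter = " "                                     # default delimiter
rendered_target = doc["answer"]                            # doc_to_target(doc), no leading whitespace

full_string = rendered_input + target_delimiter + rendered_target
# -> "Question: What color is the sky?\nAnswer: blue"
```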
#### Multiple choice format
......@@ -206,7 +207,7 @@ doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
```
Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use.
The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index if defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in a the form of a list `["no", "yes"]` that will correspond to the label index.
The label index can also be sourced from a feature directly. For example, in `superglue/boolq` the label index is defined in the feature `label`. We can set `doc_to_target` to simply `label`. The options or verbalizers can be written in the form of a list `["no", "yes"]` that will correspond to the label index.
```yaml
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
......
......@@ -37,6 +37,7 @@ Prompting / in-context formatting options:
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
- **assistant_prefill** (`str`, *optional*) — String to append after the `<|assistant|>` token. For example, if the task is to answer a question, the assistant_prefill could be "The answer is: " to prompt the model to generate the answer. If not using a chat template, this string will be appended to the end of the prompt.
Runtime configuration options:
- **num_fewshot** (`int`, *optional*, defaults to 0) — Number of few-shot examples before the input.
......
import warnings

import torch
import torch.nn as nn
from transformer_lens import HookedTransformer
from transformers import AutoConfig

from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM


def evaluate_lm_eval(lens_model: HookedTransformer, tasks: list[str], **kwargs):
    class HFLikeModelAdapter(nn.Module):
        """Adapts HookedTransformer to match the HuggingFace interface expected by lm-eval"""

        def __init__(self, model: HookedTransformer):
            super().__init__()
            self.model = model
            self.tokenizer = model.tokenizer
            self.config = AutoConfig.from_pretrained(model.cfg.tokenizer_name)
            self.device = model.cfg.device
            self.tie_weights = lambda: self

        def forward(self, input_ids=None, attention_mask=None, **kwargs):
            output = self.model(input_ids, attention_mask=attention_mask, **kwargs)
            # Make sure output has the expected .logits attribute
            if not hasattr(output, "logits"):
                if isinstance(output, torch.Tensor):
                    output.logits = output
            return output

        # Only delegate specific attributes we know we need
        def to(self, *args, **kwargs):
            return self.model.to(*args, **kwargs)

        def eval(self):
            self.model.eval()
            return self

        def train(self, mode=True):
            self.model.train(mode)
            return self

    model = HFLikeModelAdapter(lens_model)
    warnings.filterwarnings("ignore", message="Failed to get model SHA for")
    results = evaluator.simple_evaluate(
        model=HFLM(pretrained=model, tokenizer=model.tokenizer),
        tasks=tasks,
        verbosity="WARNING",
        **kwargs,
    )
    return results


if __name__ == "__main__":
    # Load base model
    model = HookedTransformer.from_pretrained("pythia-70m")
    res = evaluate_lm_eval(model, tasks=["arc_easy"])
    print(res["results"])
......@@ -257,6 +257,11 @@ def setup_parser() -> argparse.ArgumentParser:
action="store_true",
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
)
parser.add_argument(
"--confirm_run_unsafe_code",
action="store_true",
help="Confirm that you understand the risks of running unsafe code for tasks that require it",
)
return parser
......@@ -404,6 +409,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
numpy_random_seed=args.seed[1],
torch_random_seed=args.seed[2],
fewshot_random_seed=args.seed[3],
confirm_run_unsafe_code=args.confirm_run_unsafe_code,
**request_caching_args,
)
......
......@@ -112,6 +112,4 @@ class ConfigurableGroup(abc.ABC):
return self._config.group
def __repr__(self):
return (
f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})"
)
return f"ConfigurableGroup(group={self.group},group_alias={self.group_alias})"
......@@ -527,9 +527,9 @@ def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
assert (
metrics is not None
), "Need to pass a list of each subtask's metric for this stderr aggregation"
assert metrics is not None, (
"Need to pass a list of each subtask's metric for this stderr aggregation"
)
assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
......
......@@ -113,13 +113,17 @@ class LM(abc.ABC):
"""
pass
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt=True
) -> str:
"""
Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
:param chat_history: list[dict[str, str]]
A list of dictionaries with keys 'role' and 'content'.
Values are strings representing the role name and the content of the message, respectively.
:param add_generation_prompt: bool
Whether to append an assistant generation prefix (e.g. <|assistant|>) to the assistant messages in the chat history. False if prefilling an assistant message.
:return: str
A string representing the chat history in a format that can be used as input to the LM.
"""
......
......@@ -17,13 +17,13 @@ def register_model(*names):
def decorate(cls):
for name in names:
assert issubclass(
cls, LM
), f"Model '{name}' ({cls.__name__}) must extend LM class"
assert issubclass(cls, LM), (
f"Model '{name}' ({cls.__name__}) must extend LM class"
)
assert (
name not in MODEL_REGISTRY
), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
assert name not in MODEL_REGISTRY, (
f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
)
MODEL_REGISTRY[name] = cls
return cls
......@@ -48,9 +48,9 @@ func2task_index = {}
def register_task(name):
def decorate(fn):
assert (
name not in TASK_REGISTRY
), f"task named '{name}' conflicts with existing registered task!"
assert name not in TASK_REGISTRY, (
f"task named '{name}' conflicts with existing registered task!"
)
TASK_REGISTRY[name] = fn
ALL_TASKS.add(name)
......@@ -104,9 +104,9 @@ def register_metric(**args):
]:
if key in args:
value = args[key]
assert (
value not in registry
), f"{key} named '{value}' conflicts with existing registered {key}!"
assert value not in registry, (
f"{key} named '{value}' conflicts with existing registered {key}!"
)
if key == "metric":
registry[name] = fn
......@@ -140,9 +140,9 @@ def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
def register_aggregation(name: str):
def decorate(fn):
assert (
name not in AGGREGATION_REGISTRY
), f"aggregation named '{name}' conflicts with existing registered aggregation!"
assert name not in AGGREGATION_REGISTRY, (
f"aggregation named '{name}' conflicts with existing registered aggregation!"
)
AGGREGATION_REGISTRY[name] = fn
return fn
......
from functools import partial
from typing import TYPE_CHECKING, Iterable, Optional, Union
import datasets
if TYPE_CHECKING:
from random import Random
from lm_eval.api.task import ConfigurableTask, Task
class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
def __init__(
self,
docs: list[dict],
task: Union["Task", "ConfigurableTask"],
fewshot_indices: Optional[Iterable] = None,
rnd: Optional["Random"] = None,
) -> None:
self.rnd = rnd
if not self.rnd:
raise ValueError(
......@@ -58,8 +71,9 @@ class ContextSampler:
)
self.docs = self.docs.select(fewshot_indices)
def get_context(self, doc, num_fewshot):
def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str = None):
# draw an extra fewshot sample if using same split as evaluating on
prefix = gen_prefix + " " if gen_prefix else ""
n_samples = (
num_fewshot + 1
if self.config.fewshot_split == self.config.test_split
......@@ -77,14 +91,14 @@ class ContextSampler:
for doc in selected_docs:
doc_content = self.doc_to_text(doc)
doc_target = self.doc_to_target(doc)
labeled_examples += (
doc_content
if self.config.doc_to_choice is None or isinstance(doc_content, str)
else self.doc_to_choice(doc)[doc_content]
)
if self.config.doc_to_choice is None or isinstance(doc_content, str):
labeled_examples += doc_content
else:
labeled_examples += self.doc_to_choice(doc)[doc_content]
if doc_target != "":
labeled_examples += self.target_delimiter
labeled_examples += prefix
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
......@@ -98,10 +112,13 @@ class ContextSampler:
def get_chat_context(
self,
doc,
num_fewshot,
doc: dict,
num_fewshot: int,
fewshot_as_multiturn: bool = False,
gen_prefix: Optional[str] = None,
):
# TODO: Do we need any other delimiter
prefix = gen_prefix + " " if gen_prefix else ""
chat_history = []
# draw an extra fewshot sample if using same split as evaluating on
n_samples = (
......@@ -132,23 +149,28 @@ class ContextSampler:
chat_history.append(
{
"role": "assistant",
"content": str(doc_target[0])
"content": prefix + str(doc_target[0])
if isinstance(doc_target, list)
else doc_target
else prefix + doc_target
if self.config.doc_to_choice is None
or isinstance(doc_target, str)
else str(self.doc_to_choice(doc)[doc_target]),
else prefix + str(self.doc_to_choice(doc)[doc_target]),
}
)
else:
# get fewshot context as one user turn
chat_history.append(
{"role": "user", "content": self.get_context(doc, num_fewshot)}
{
"role": "user",
"content": self.get_context(
doc, num_fewshot, gen_prefix=gen_prefix
),
}
)
return chat_history
def sample(self, n):
def sample(self, n: int):
"""
Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
"""
......@@ -157,19 +179,19 @@ class ContextSampler:
class FirstNSampler(ContextSampler):
def sample(self, n) -> None:
def sample(self, n: int) -> None:
"""
Draw the first `n` samples in order from the specified split.
Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
"""
assert (
n <= len(self.docs)
), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
assert n <= len(self.docs), (
f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
)
return self.docs[:n]
class BalancedSampler(ContextSampler):
def sample(self, n) -> None:
def sample(self, n: int) -> None:
"""
TODO: this should return approximately class-balanced samples from our fewshot examples.
TODO: what order should they be in? maybe random?
......@@ -179,7 +201,7 @@ class BalancedSampler(ContextSampler):
class ManualSampler(ContextSampler):
def sample(self, n) -> None:
def sample(self, n: int) -> None:
""" """
pass
......@@ -190,7 +212,7 @@ SAMPLER_REGISTRY = {
}
def get_sampler(name):
def get_sampler(name: str):
try:
return SAMPLER_REGISTRY[name]
except KeyError:
......
......@@ -75,6 +75,7 @@ class TaskConfig(dict):
doc_to_text: Optional[Union[Callable, str]] = None
doc_to_target: Optional[Union[Callable, str]] = None
doc_to_image: Union[Callable, str] = None
unsafe_code: bool = False
doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
process_results: Optional[Union[Callable, str]] = None
use_prompt: Optional[str] = None
......@@ -92,6 +93,7 @@ class TaskConfig(dict):
filter_list: Optional[Union[str, list]] = None
should_decontaminate: bool = False
doc_to_decontamination_query: Optional[str] = None
gen_prefix: Optional[str] = None
metadata: Optional[dict] = (
None # by default, not used in the code. allows for users to pass arbitrary info to tasks
)
......@@ -369,6 +371,9 @@ class Task(abc.ABC):
def doc_to_image(self, doc):
raise NotImplementedError
def doc_to_prefix(self, doc):
return ""
def build_all_requests(
self,
*,
......@@ -398,7 +403,7 @@ class Task(abc.ABC):
)
cache_key += f"-tokenizer{tokenizer_name}"
cached_instances = load_from_cache(file_name=cache_key)
cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)
if cache_requests and cached_instances and not rewrite_requests_cache:
cached_instances = cached_instances[:limit]
......@@ -442,6 +447,7 @@ class Task(abc.ABC):
apply_chat_template,
fewshot_as_multiturn,
chat_template,
gen_prefix=self.doc_to_prefix(doc),
)
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
......@@ -450,6 +456,7 @@ class Task(abc.ABC):
ctx=fewshot_ctx,
metadata=(self.config["task"], doc_id, self.config.repeats),
apply_chat_template=apply_chat_template,
chat_template=chat_template,
)
if not isinstance(inst, list):
......@@ -541,13 +548,7 @@ class Task(abc.ABC):
return len(re.split(r"\s+", doc))
@utils.positional_deprecated
def fewshot_context(
self,
doc,
num_fewshot,
rnd=None,
description=None,
):
def fewshot_context(self, doc, num_fewshot, rnd=None, description=None, **kwargs):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -732,6 +733,9 @@ class ConfigurableTask(Task):
# mark the task as requiring multimodality.
self.MULTIMODAL = True
if self.config.unsafe_code is not False:
self.UNSAFE_CODE = True
if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path
......@@ -1000,6 +1004,7 @@ class ConfigurableTask(Task):
labeled_examples: List[Dict[str, str]],
question: str,
fewshot_as_multiturn: bool = False,
gen_prefix: Optional[str] = None,
) -> None:
"""Adds a target question to the labeled examples list.
If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
......@@ -1015,17 +1020,20 @@ class ConfigurableTask(Task):
else:
# if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
labeled_examples.append({"role": "user", "content": question})
if gen_prefix:
labeled_examples.append({"role": "assistant", "content": gen_prefix})
@utils.positional_deprecated
def fewshot_context(
self,
doc: str,
doc: dict,
num_fewshot: int,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
) -> str:
gen_prefix: Optional[str] = None,
) -> Union[str, List[str]]:
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -1044,7 +1052,6 @@ class ConfigurableTask(Task):
:returns: str
The fewshot context.
"""
if apply_chat_template:
labeled_examples = []
else:
......@@ -1072,25 +1079,35 @@ class ConfigurableTask(Task):
labeled_examples.append({"role": "system", "content": system_prompt})
else:
labeled_examples = system_prompt
# if few-shot - append examples after the system prompt
if num_fewshot > 0:
if apply_chat_template:
labeled_examples.extend(
self.sampler.get_chat_context(
doc, num_fewshot, fewshot_as_multiturn
doc,
num_fewshot,
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
)
else:
labeled_examples += self.sampler.get_context(doc, num_fewshot)
labeled_examples += self.sampler.get_context(
doc, num_fewshot, gen_prefix=gen_prefix
)
example = self.doc_to_text(doc)
if apply_chat_template:
if self.multiple_input:
# TODO: append prefill?
if not labeled_examples:
return ""
return chat_template(labeled_examples)
if isinstance(example, str):
self.append_target_question(
labeled_examples, example, fewshot_as_multiturn
labeled_examples,
example,
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
# for loglikelihood create a list of questions with appended choices
elif isinstance(example, list):
......@@ -1098,37 +1115,62 @@ class ConfigurableTask(Task):
# copy chat history for each example and append the answer
for ex in example:
chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(chat_template(chat))
self.append_target_question(
chat,
ex,
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
# TODO: append prefill?
labeled_examples_list.append(
chat_template(
chat,
add_generation_prompt=False if gen_prefix else True,
)
)
return labeled_examples_list
# if example is an integer, append the choice or convert to string
elif isinstance(example, int):
if self.config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
self.append_target_question(
labeled_examples, choices[example], fewshot_as_multiturn
labeled_examples,
choices[example],
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
else:
self.append_target_question(
labeled_examples, str(example), fewshot_as_multiturn
labeled_examples,
str(example),
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
# return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
return chat_template(
labeled_examples,
add_generation_prompt=False if gen_prefix else True,
)
else:
prefix = (
self.config.target_delimiter + gen_prefix
if gen_prefix is not None
else ""
)
if self.multiple_input:
return labeled_examples
if isinstance(example, str):
return labeled_examples + example
return labeled_examples + example + prefix
elif isinstance(example, list):
return [labeled_examples + ex for ex in example]
return [labeled_examples + ex + prefix for ex in example]
elif isinstance(example, int):
if self.config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
return labeled_examples + choices[example]
return labeled_examples + choices[example] + prefix
else:
return labeled_examples + str(example)
return labeled_examples + str(example) + prefix
def apply_filters(self):
def apply_filters(self) -> Optional[List[Instance]]:
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
......@@ -1140,7 +1182,7 @@ class ConfigurableTask(Task):
def should_decontaminate(self):
return self.config.should_decontaminate
def doc_to_decontamination_query(self, doc):
def doc_to_decontamination_query(self, doc: dict):
if self.config.should_decontaminate:
if self.config.doc_to_decontamination_query is None:
return self.doc_to_text(doc)
......@@ -1299,10 +1341,19 @@ class ConfigurableTask(Task):
else:
return None
def doc_to_prefix(self, doc):
if (gen_prefix := self.config.gen_prefix) is not None:
if gen_prefix in self.features:
return doc[gen_prefix]
else:
return utils.apply_template(gen_prefix, doc)
return None
def construct_requests(
self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]:
apply_chat_template = kwargs.pop("apply_chat_template", False)
chat_template: Callable | None = kwargs.pop("chat_template", None)
aux_arguments = None
......@@ -1317,9 +1368,20 @@ class ConfigurableTask(Task):
target_delimiter = ""
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
# apply chat_template to choices if apply_chat_template
cont = self.doc_to_target(doc)
arguments = [
(ctx + choice, f"{target_delimiter}{cont}") for choice in choices
(
ctx
+ (
chat_template([{"role": "user", "content": choice}])
if apply_chat_template
else choice
),
f"{target_delimiter}{cont}",
)
for choice in choices
]
else:
# Otherwise they are placed in the continuation
......@@ -1503,9 +1565,9 @@ class ConfigurableTask(Task):
# we expect multiple_targets to be a list.
elif self.multiple_target:
gold = list(gold)
elif (
type(gold) is not type(result)
and "bypass" not in self._metric_fn_list.keys()
# TODO: handle this better
elif type(gold) is not type(result) and not (
"bypass" in self._metric_fn_list.keys() or isinstance(result, list)
):
# cast gold to the same type as result
gold = type(result)(gold)
......@@ -1559,10 +1621,13 @@ class ConfigurableTask(Task):
)
except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
result_score = self._metric_fn_list[metric]([gold, result])
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
result_score = result_score[metric]
result_dict[metric] = result_score
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
# This allows for multiple metrics to be returned from the same function
for k, v in result_score.items():
result_dict[k] = v
else:
result_dict[metric] = result_score
else:
raise ValueError(
f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
......
......@@ -21,7 +21,9 @@ HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()
FILE_SUFFIX = f".{HASH_PREFIX}.pickle"
def load_from_cache(file_name):
def load_from_cache(file_name: str, cache: bool = False):
if not cache:
return
try:
path = f"{PATH}/{file_name}{FILE_SUFFIX}"
......
......@@ -110,12 +110,15 @@ class TextReader:
def read_tqdm(self, update_frequency: int = 10000):
current_file_position = 0
line_counter = 0
with open(self.file_path, "r", encoding="utf-8") as fh, tqdm.tqdm(
total=os.path.getsize(self.file_path),
dynamic_ncols=True,
unit="byte",
unit_scale=1,
) as progress:
with (
open(self.file_path, "r", encoding="utf-8") as fh,
tqdm.tqdm(
total=os.path.getsize(self.file_path),
dynamic_ncols=True,
unit="byte",
unit_scale=1,
) as progress,
):
with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
for line in iter(mmap_obj.readline, b""):
line = line.decode("utf-8")
......
......@@ -151,7 +151,7 @@ def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> d
elapsed = time.perf_counter() - start
print(f"Read took {elapsed:0.5f} seconds.")
print(f"Speed: {(os.path.getsize(file)/1000000.0)/elapsed}MB/second")
print(f"Speed: {(os.path.getsize(file) / 1000000.0) / elapsed}MB/second")
print(duplicates)
......
......@@ -74,6 +74,7 @@ def simple_evaluate(
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
confirm_run_unsafe_code: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -313,6 +314,7 @@ def simple_evaluate(
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
verbosity=verbosity,
confirm_run_unsafe_code=confirm_run_unsafe_code,
)
if lm.rank == 0:
......@@ -372,6 +374,7 @@ def evaluate(
apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False,
verbosity: str = "INFO",
confirm_run_unsafe_code: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -381,6 +384,10 @@ def evaluate(
Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param cache_requests: bool, optional
Speed up evaluation by caching the building of dataset requests.
:param rewrite_requests_cache: bool, optional
Rewrites all the request cache if set to `True`.
:param bootstrap_iters:
Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
:param write_out: bool
......@@ -396,6 +403,10 @@ def evaluate(
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param verbosity: str
Verbosity level for logging
:param confirm_run_unsafe_code: bool
Whether to confirm running tasks marked as unsafe.
:return
Dictionary of results
"""
......@@ -422,13 +433,19 @@ def evaluate(
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
# validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
# validation checks:
# 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
# 2.are we running code that is marked as unsafe.
incompatible_tasks = []
for task_output in eval_tasks:
task: Task = task_output.task
if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
incompatible_tasks.append(task_output.task_name)
elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code:
raise ValueError(
f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
)
if len(incompatible_tasks) > 0:
if not getattr(lm, "MULTIMODAL", False):
raise ValueError(
......@@ -438,7 +455,7 @@ def evaluate(
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
)
# end multimodality validation check
# end validation check
# Cache the limit arg.
limit_arg = limit
......
......@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, Union
from lm_eval.api.group import ConfigurableGroup
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
mean,
pooled_sample_stderr,
stderr_for_metric,
)
......@@ -99,7 +100,12 @@ class TaskOutput:
def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
for (metric, filter_key), items in self.sample_metrics.items():
agg_fn = self.task.aggregation()[metric]
try:
agg_fn = self.task.aggregation()[metric]
except KeyError:
# This is when process results output an arbitrary metric
# TODO: Handle this better and allow other aggregate functions other than mean.
agg_fn = mean
metric_key = f"{metric},{filter_key}"
self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric?
......