Merge branch 'main' into feature/eval_from_config

84d02f77 · Baber · 15ce554c · fcddf195 · 15ce554c · 15ce554c
Commit 84d02f77 authored Jul 10, 2025 by Baber
20 changed files
--- a/.coveragerc
+++ b/.coveragerc
-[run]
-
-# tasks that aren't wired up.
-omit =
-    lm_eval/tasks/quac.py
-    lm_eval/tasks/storycloze.py
-    lm_eval/tasks/cbt.py
-    lm_eval/tasks/sat.py
-    lm_eval/tasks/triviaqa.py
-    lm_eval/tasks/naturalqs.py
-    lm_eval/models/dummy.py
-
-[report]
-exclude_lines =
-    # Skip any pass lines such as may be used for @abstractmethod
-    pass
-
-    # Have to re-enable the standard pragma
-    pragma: no cover
-
-    # Don't complain about missing debug-only code:
-    def __repr__
-    if self\.debug
-
-    # Don't complain if tests don't hit defensive assertion code:
-    raise AssertionError
-    raise NotImplementedError
-    return NotImplemented
--- a/.flake8
+++ b/.flake8
-[flake8]
-ignore = E203, E266, E501, W503, F403, F401, C901
-max-line-length = 127
-max-complexity = 10
-select = B,C,E,F,W,T4,B9
--- a/.github/workflows/new_tasks.yml
+++ b/.github/workflows/new_tasks.yml
@@ -50,12 +50,12 @@ jobs:
        with:
          python-version: 3.9
          cache: 'pip'
-          cache-dependency-path: setup.py
+          cache-dependency-path: pyproject.toml
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
            python -m pip install --upgrade pip
-            pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
+            pip install -e '.[dev,ifeval,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
    #   Install optional git dependencies
    #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -53,7 +53,7 @@ jobs:

      # Cache HuggingFace cache directory for CPU tests
      - name: Cache HuggingFace cache (CPU tests)
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        id: cache-hf-cpu
        with:
          path: ~/.cache/huggingface
@@ -64,11 +64,11 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
          pip install hf_xet

      - name: Test with pytest
-        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py
+        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py
        continue-on-error: true  # Continue workflow even if tests fail

      # Save test artifacts
@@ -106,7 +106,7 @@ jobs:
 #      - name: Install dependencies
 #        run: |
 #          python -m pip install --upgrade pip
-#          pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+#          pip install -e '.[dev,optimum,api]' --extra-index-url https://download.pytorch.org/whl/cpu
 #          pip install -U transformers peft accelerate
 #
 #      - name: Test with pytest

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
      - id: mixed-line-ending
        args: [--fix=lf]
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.10
+    rev: v0.12.2
    hooks:
      # Run the linter.
      - id: ruff
@@ -47,7 +47,7 @@ repos:
          )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
  - repo: https://github.com/jackdewinter/pymarkdown
-    rev: v0.9.29
+    rev: v0.9.30
    hooks:
      - id: pymarkdown
        exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$

--- a/README.md
+++ b/README.md
@@ -364,7 +364,7 @@ lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-
 Note that for externally hosted models, configs such as `--device` which relate to where to place a local model should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.

 | API or Inference Server                                                                                                   | Implemented?                                                                                            | `--model <xxx>` name                                | Models supported:                                                                                                                                                                                                                                                                                                                                          | Request Types:                                                                 |
-| --------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------|
+|---------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------|
 | OpenAI Completions                                                                                                        | :heavy_check_mark:                                                                                      | `openai-completions`, `local-completions`           | All OpenAI Completions API models                                                                                                                                                                                                                                                                                                                          | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
 | OpenAI ChatCompletions                                                                                                    | :heavy_check_mark:                                                                                      | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt)                                                                                                                                                                                                                                                                              | `generate_until` (no logprobs)                                                 |
 | Anthropic                                                                                                                 | :heavy_check_mark:                                                                                      | `anthropic`                                         | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model)                                                                                                                                                                                                                                                               | `generate_until` (no logprobs)                                                 |
@@ -377,8 +377,6 @@ Note that for externally hosted models, configs such as `--device` which relate
 | Huggingface Optimum (Causal LMs)                                                                                          | :heavy_check_mark:                                                                                      | `openvino`                                          | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format                                                                                                                                                                                                                            | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
 | Huggingface Optimum-intel IPEX (Causal LMs)                                                                               | :heavy_check_mark:                                                                                      | `ipex`                                              | Any decoder-only AutoModelForCausalLM                                                                                                                                                                                                                                                                                                                      | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
 | Neuron via AWS Inf2 (Causal LMs)                                                                                          | :heavy_check_mark:                                                                                      | `neuronx`                                           | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2)                                                                                                                                                                                            | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse)                                                      | :heavy_check_mark:                                                                                      | `deepsparse`                                        | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse)                                                                                                                                                                                                       | `generate_until`, `loglikelihood`                                              |
-| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml)                                                          | :heavy_check_mark:                                                                                      | `sparseml`                                          | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
 | NVIDIA NeMo                                                                                                               | :heavy_check_mark:                                                                                      | `nemo_lm`                                           | [All supported models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/nemotoolkit/core/core.html#nemo-models)                                                                                                                                                                                                                                     | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
 | Watsonx.ai                                                                                                                | :heavy_check_mark:                                                                                      | `watsonx_llm`                                       | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx)                                                                                                                                                                                                                                 | `generate_until` `loglikelihood`                                               |
 | [Your local inference server!](docs/API_guide.md)                                                                         | :heavy_check_mark:                                                                                      | `local-completions` or `local-chat-completions`     | Support for OpenAI API-compatible servers, with easy customization for other APIs.                                                                                                                                                                                                                                                                         | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
@@ -572,9 +570,19 @@ lm_eval \

 In the stdout, you will find the link to the W&B run page as well as link to the generated report. You can find an example of this workflow in [examples/visualize-wandb.ipynb](examples/visualize-wandb.ipynb), and an example of how to integrate it beyond the CLI.

-## How to Contribute or Learn More?
+## Contributing

-For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.
+Check out our [open issues](https://github.com/EleutherAI/lm-evaluation-harness/issues) and feel free to submit pull requests!
+
+For more information on the library and how everything fits together, see our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs).
+
+To get started with development, first clone the repository and install the dev dependencies:
+
+```bash
+git clone https://github.com/EleutherAI/lm-evaluation-harness
+cd lm-evaluation-harness
+pip install -e ".[dev]"
+```

 ### Implementing new tasks

@@ -599,37 +607,24 @@ The best way to get support is to open an issue on this repo or join the [Eleuth

 Extras dependencies can be installed via `pip install -e ".[NAME]"`

-| Name                 | Use                                                   |
-| -------------------- | ----------------------------------------------------- |
-| api                  | For using api models (Anthropic, OpenAI API)          |
-| audiolm_qwen         | For running Qwen2 audio models                        |
-| deepsparse           | For running NM's DeepSparse models                    |
-| dev                  | For linting PRs and contributions                     |
-| gptq                 | For loading models with AutoGPTQ                      |
-| gptqmodel            | For loading models with GPTQModel                     |
-| hf_transfer          | For speeding up HF Hub file downloads                 |
-| ibm_watsonx_ai       | For using IBM watsonx.ai model apis                   |
-| ifeval               | For running the IFEval task                           |
-| ipex                 | For running on optimum-intel ipex backend             |
-| japanese_leaderboard | For running Japanese LLM Leaderboard tasks            |
-| longbench            | For running LongBench tasks                           |
-| mamba                | For loading Mamba SSM models                          |
-| math                 | For running math task answer checking                 |
-| multilingual         | For multilingual tokenizers                           |
-| neuronx              | For running on AWS inf2 instances                     |
-| optimum              | For running Intel OpenVINO models                     |
-| promptsource         | For using PromptSource prompts                        |
-| ruler                | For running RULER tasks                               |
-| sae_lens             | For using SAELens to steer models                     |
-| sentencepiece        | For using the sentencepiece tokenizer                 |
-| sparseml             | For using NM's SparseML models                        |
-| sparsify             | For using Sparsify to steer models                    |
-| testing              | For running library test suite                        |
-| vllm                 | For loading models with vLLM                          |
-| wandb                | For integration with `Weights and Biases` platform    |
-| zeno                 | For visualizing results with Zeno                     |
-| -------------------- | ----------------------------------------------------- |
-| all                  | Loads all extras (not recommended)                    |
+| NAME                 | Description                    | NAME           | Description                           |
+|----------------------|--------------------------------|----------------|---------------------------------------|
+| tasks                | All task-specific dependencies | api            | API models (Anthropic, OpenAI, local) |
+| acpbench             | ACP Bench tasks                | audiolm_qwen   | Qwen2 audio models                    |
+| ifeval               | IFEval task                    |                |                                       |
+| japanese_leaderboard | Japanese LLM tasks             | gptq           | AutoGPTQ models                       |
+| longbench            | LongBench tasks                | gptqmodel      | GPTQModel models                      |
+| math                 | Math answer checking           | hf_transfer    | Speed up HF downloads                 |
+| multilingual         | Multilingual tokenizers        | ibm_watsonx_ai | IBM watsonx.ai models                 |
+| ruler                | RULER tasks                    | ipex           | Intel IPEX backend                    |
+|                      |                                |                |                                       |
+| dev                  | Linting & contributions        | mamba          | Mamba SSM models                      |
+| promptsource         | PromptSource prompts           | neuronx        | AWS inf2 instances                    |
+| sentencepiece        | Sentencepiece tokenizer        | optimum        | Intel OpenVINO models                 |
+| testing              | Run test suite                 | sae_lens       | SAELens model steering                |
+| unitxt               | Run unitxt tasks               |                |                                       |
+| wandb                | Weights & Biases               | sparsify       | Sparsify model steering               |
+| zeno                 | Result visualization           | vllm           | vLLM models                           |

 ## Cite as


--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@@ -30,7 +30,7 @@ in order to ensure linters and other checks will be run upon committing.
 We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:

 ```bash
-python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
+python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py
 ```

 ## Contributor License Agreement

--- a/lm_eval/_cli/run.py
+++ b/lm_eval/_cli/run.py
@@ -8,6 +8,8 @@ from functools import partial
 from lm_eval._cli.subcommand import SubCommand
 from lm_eval._cli.utils import (
    _int_or_none_list_arg_type,
+    key_val_to_dict,
+    merge_dicts,
    request_caching_arg_to_dict,
    try_parse_json,
 )
@@ -26,13 +28,13 @@ class Run(SubCommand):
            epilog=textwrap.dedent("""
                examples:
                  # Basic evaluation with HuggingFace model
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag

                  # Evaluate on multiple tasks with few-shot examples
-                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
+                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5

                  # Evaluation with custom generation parameters
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95

                  # Use configuration file
                  $ lm-eval run --config my_config.yaml --tasks mmlu
@@ -73,9 +75,10 @@ class Run(SubCommand):
            "-t",
            default=None,
            type=str,
-            metavar="TASK1,TASK2",
+            nargs="*",
+            metavar="TASK1 TASK2",
            help=textwrap.dedent("""
-                Comma-separated list of task names or groupings.
+                Space or Comma-separated list of task names or groupings.
                Use 'lm-eval list tasks' to see all available tasks.
            """).strip(),
        )
@@ -83,9 +86,10 @@ class Run(SubCommand):
            "--model_args",
            "-a",
            default=None,
-            type=try_parse_json,
+            nargs="*",
+            type=key_val_to_dict,
            metavar="ARGS",
-            help="Model arguments as 'key=val,key2=val2' or JSON string",
+            help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`",
        )

        # Evaluation Settings
@@ -124,10 +128,13 @@ class Run(SubCommand):
        )
        eval_group.add_argument(
            "--gen_kwargs",
-            type=try_parse_json,
+            type=key_val_to_dict,
            default=None,
+            nargs="*",
            metavar="KWARGS",
-            help="Generation arguments as 'key=val,key2=val2' or JSON string",
+            help=textwrap.dedent(
+                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. Values should be parsable with ast.literal_eval.'
+            ),
        )

        # Data and Output
@@ -250,24 +257,24 @@ class Run(SubCommand):
        )
        logging_group.add_argument(
            "--wandb_args",
-            type=str,
+            type=key_val_to_dict,
            default=argparse.SUPPRESS,
            metavar="ARGS",
-            help="Weights & Biases init arguments (key=val,key2=val2)",
+            help="Weights & Biases init arguments key=val key2=val2",
        )
        logging_group.add_argument(
            "--wandb_config_args",
-            type=str,
+            type=key_val_to_dict,
            default=argparse.SUPPRESS,
            metavar="ARGS",
-            help="Weights & Biases config arguments (key=val,key2=val2)",
+            help="Weights & Biases config arguments key=val key2=val2",
        )
        logging_group.add_argument(
            "--hf_hub_log_args",
-            type=str,
+            type=key_val_to_dict,
            default=argparse.SUPPRESS,
            metavar="ARGS",
-            help="Hugging Face Hub logging arguments (key=val,key2=val2)",
+            help="Hugging Face Hub logging arguments key=val key2=val2",
        )

        # Advanced Options
@@ -313,9 +320,21 @@ class Run(SubCommand):
            ),
        )

-    def _execute(self, args: argparse.Namespace) -> None:
+    @staticmethod
+    def _execute(args: argparse.Namespace) -> None:
        """Runs the evaluation harness with the provided arguments."""
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        MERGE_ARGS_DICTS = [
+            "model_args",
+            "gen_kwargs",
+            "wandb_args",
+            "wandb_config_args",
+            "hf_hub_log_args",
+        ]
+        for arg_name in MERGE_ARGS_DICTS:
+            if current_value := getattr(args, arg_name, None):
+                setattr(args, arg_name, merge_dicts(*current_value))
+
        from lm_eval.config.evaluate_config import EvaluatorConfig

        eval_logger = logging.getLogger(__name__)

--- a/lm_eval/_cli/utils.py
+++ b/lm_eval/_cli/utils.py
 import argparse
+import ast
 import json
 import logging
-from typing import Optional, Union
+from typing import Any, Optional, Union


 def try_parse_json(value: Union[str, dict, None]) -> Union[str, dict, None]:
@@ -81,3 +82,35 @@ def check_argument_types(parser: argparse.ArgumentParser) -> None:
            raise ValueError(f"Argument '{action.dest}' doesn't have a type specified.")
        else:
            continue
+
+
+def handle_cli_value_string(arg: str) -> Any:
+    if arg.lower() == "true":
+        return True
+    elif arg.lower() == "false":
+        return False
+    elif arg.isnumeric():
+        return int(arg)
+    try:
+        return float(arg)
+    except ValueError:
+        try:
+            return ast.literal_eval(arg)
+        except (ValueError, SyntaxError):
+            return arg
+
+
+def key_val_to_dict(args: str) -> dict:
+    """Parse model arguments from a string into a dictionary."""
+    return (
+        {
+            k: handle_cli_value_string(v)
+            for k, v in (item.split("=") for item in args.split(","))
+        }
+        if args
+        else {}
+    )
+
+
+def merge_dicts(*dicts):
+    return {k: v for d in dicts for k, v in d.items()}
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
 import logging
 import math
+import os
 import random
 import re
 import string
 from collections.abc import Iterable
-from typing import List
+from typing import Callable, List, Optional, Sequence, TypeVar

 import numpy as np
 import sacrebleu
@@ -12,6 +13,8 @@ import sacrebleu
 from lm_eval.api.registry import register_aggregation, register_metric


+T = TypeVar("T")
+
 eval_logger = logging.getLogger(__name__)


@@ -287,7 +290,7 @@ def pop_stddev(arr):
    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))


-def sample_stddev(arr):
+def sample_stddev(arr: Sequence[T]) -> float:
    mu = mean(arr)
    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))

@@ -449,11 +452,16 @@ def _sacreformat(refs, preds):


 class _bootstrap_internal:
-    def __init__(self, f, n) -> None:
+    """
+    Pool worker: `(i, xs)` → `n` bootstrap replicates
+    of `f(xs)` using an RNG seeded with `i`.
+    """
+
+    def __init__(self, f: Callable[[Sequence[T]], float], n: int) -> None:
        self.f = f
        self.n = n

-    def __call__(self, v):
+    def __call__(self, v: tuple[int, Sequence[T]]) -> list[float]:
        i, xs = v
        rnd = random.Random()
        rnd.seed(i)
@@ -463,36 +471,81 @@ class _bootstrap_internal:
        return res


-def bootstrap_stderr(f, xs, iters):
-    import multiprocessing as mp
-
-    pool = mp.Pool(mp.cpu_count())
-    # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
-    # equivalent to stderr calculated without Bessel's correction in the stddev.
-    # Unfortunately, I haven't been able to figure out what the right correction is
-    # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
-    # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
-    # Thankfully, shouldn't matter because our samples are pretty big usually anyways
+def _bootstrap_internal_no_mp(
+    f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
+) -> list[float]:
+    """
+    Single-process fallback: compute `iters` bootstrap replicates
+    of the statistic `f(xs)`, chunked (≤ 1000 draws).
+    """
    res = []
    chunk_size = min(1000, iters)
    from tqdm import tqdm

-    print("bootstrapping for stddev:", f.__name__)
-    for bootstrap in tqdm(
-        pool.imap(
-            _bootstrap_internal(f, chunk_size),
-            [(i, xs) for i in range(iters // chunk_size)],
-        ),
-        total=iters // chunk_size,
-    ):
-        # sample w replacement
-        res.extend(bootstrap)
-
-    pool.close()
+    print(f"bootstrapping for stddev: {f.__name__}")
+
+    # A single loop replaces the multiprocessing pool.
+    for i in tqdm(range(iters // chunk_size)):
+        rnd = random.Random(i)
+        for _ in range(chunk_size):
+            res.append(f(rnd.choices(xs, k=len(xs))))
+
+    return res
+
+
+def bootstrap_stderr(
+    f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
+) -> float:
+    """
+    Bootstrap estimate of the standard error of the statistic `f(xs)`
+    using up to `iters` resamples, chunked (≤ 1000 draws).
+
+    Executes in parallel unless the env-var `DISABLE_MULTIPROC` is set to a non-empty value.
+    """
+    if not os.getenv("DISABLE_MULTIPROC"):
+        import multiprocessing as mp
+
+        pool = mp.Pool(mp.cpu_count())
+        # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
+        # equivalent to stderr calculated without Bessel's correction in the stddev.
+        # Unfortunately, I haven't been able to figure out what the right correction is
+        # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
+        # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
+        # Thankfully, shouldn't matter because our samples are pretty big usually anyways
+        res = []
+        chunk_size = min(1000, iters)
+        from tqdm import tqdm
+
+        print("bootstrapping for stddev:", f.__name__)
+        for bootstrap in tqdm(
+            pool.imap(
+                _bootstrap_internal(f, chunk_size),
+                [(i, xs) for i in range(iters // chunk_size)],
+            ),
+            total=iters // chunk_size,
+        ):
+            # sample w replacement
+            res.extend(bootstrap)
+
+        pool.close()
+    else:
+        res = _bootstrap_internal_no_mp(f, xs, iters)
+
    return sample_stddev(res)


-def stderr_for_metric(metric, bootstrap_iters: int):
+def stderr_for_metric(
+    metric: Callable[[Sequence[T]], float], bootstrap_iters: int
+) -> Optional[Callable[[Sequence[T]], float]]:
+    """
+    Return a function that estimates the standard error of `metric(xs)`.
+
+    * If `bootstrap_iters > 0` and the metric is in the pre-approved
+      bootstrappable list, use `bootstrap_stderr` with that many draws.
+    * If the metric has a closed-form SE (e.g. `mean`, `acc_all`), use it.
+    * Otherwise, return `None`.
+    """
+
    if bootstrap_iters <= 0:
        # return no function (don't compute stderr) if bootstrap iters = 0
        return None

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -3,15 +3,19 @@ import hashlib
 import json
 import logging
 import os
-from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union

-import transformers
-from sqlitedict import SqliteDict
 from tqdm import tqdm

 from lm_eval import utils


+if TYPE_CHECKING:
+    from sqlitedict import SqliteDict
+
+    from lm_eval.api.instance import Instance
+
+
 eval_logger = logging.getLogger(__name__)

 T = TypeVar("T", bound="LM")
@@ -27,10 +31,10 @@ class LM(abc.ABC):
        # set rank and world size to a single process, by default.
        self._rank = 0
        self._world_size = 1
-        self.cache_hook = CacheHook(None)
+        self.cache_hook: "CacheHook" = CacheHook(None)

    @abc.abstractmethod
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood(self, requests) -> list[tuple[float, bool]]:
        """Compute log-likelihood of generating a continuation from a context.
        Downstream tasks should attempt to use loglikelihood instead of other
        LM calls whenever possible.
@@ -55,7 +59,7 @@ class LM(abc.ABC):
        pass

    @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[float]:
+    def loglikelihood_rolling(self, requests) -> list[float]:
        """Compute full log-likelihood of a string, with no truncation, for perplexity computation
        - We will use the full max context length of the model.
        - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
@@ -97,7 +101,7 @@ class LM(abc.ABC):

    # TODO: Add an optional max length
    @abc.abstractmethod
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> list[str]:
        """Generate greedily until a stopping sequence

        :param requests: list[Instance]
@@ -114,7 +118,7 @@ class LM(abc.ABC):
        pass

    def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+        self, chat_history: list[dict[str, str]], add_generation_prompt=True
    ) -> str:
        """
        Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
@@ -165,8 +169,7 @@ class LM(abc.ABC):
        - Instance of the LM class.
        """

-        additional_config = {} if additional_config is None else additional_config
-        additional_config = {
+        # Treat a missing config as empty, then drop None-valued entries; the
+        # conditional (rather than `or`/`|`) keeps the filter applied in every case.
+        additional_config = {} if additional_config is None else {
            k: v for k, v in additional_config.items() if v is not None
        }

@@ -204,25 +207,25 @@ class LM(abc.ABC):

        return ""

-    def set_cache_hook(self, cache_hook) -> None:
+    def set_cache_hook(self, cache_hook: "CacheHook") -> None:
        self.cache_hook = cache_hook


 ### SQLite-based caching of LM responses
-def hash_args(attr, args):
+def hash_args(attr: str, args: Iterable[Any]) -> str:
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode("utf-8")).hexdigest()


 class CacheHook:
-    def __init__(self, cachinglm) -> None:
+    def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
        if cachinglm is None:
-            self.dbdict = None
+            self.dbdict: Optional["SqliteDict"] = None
            return

        self.dbdict = cachinglm.dbdict

-    def add_partial(self, attr, req, res) -> None:
+    def add_partial(self, attr: str, req: Iterable[Any], res: Any) -> None:
        if self.dbdict is None:
            return
        hsh = hash_args(attr, req)
@@ -230,7 +233,7 @@ class CacheHook:


 class CachingLM:
-    def __init__(self, lm, cache_db) -> None:
+    def __init__(self, lm: LM, cache_db: str) -> None:
        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.

        :param lm: LM
@@ -238,8 +241,10 @@ class CachingLM:
        :param cache_db: str
            Path to cache db
        """
-        self.lm = lm
-        self.cache_db = cache_db
+        from sqlitedict import SqliteDict
+
+        self.lm: LM = lm
+        self.cache_db: str = cache_db
        if os.path.dirname(cache_db):
            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
        self.dbdict = SqliteDict(cache_db, autocommit=True)
@@ -247,13 +252,13 @@ class CachingLM:
        # add hook to lm
        lm.set_cache_hook(self.get_cache_hook())

-    def __getattr__(self, attr: str):
+    def __getattr__(self, attr: str) -> Any:
        lm_attr = getattr(self.lm, attr)
        if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
            eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
            return lm_attr

-        def fn(requests):
+        def _fn(requests: list["Instance"]) -> list["Instance"]:
            res = []
            remaining_reqs = []
            warned = False
@@ -306,9 +311,9 @@ class CachingLM:

            return res

-        return fn
+        return _fn

-    def get_cache_hook(self):
+    def get_cache_hook(self) -> "CacheHook":
        return CacheHook(self)


@@ -331,19 +336,23 @@ class TemplateLM(LM):
        return self.eot_token_id

    @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs) -> List[int]:
+    def tok_encode(self, string: str, **kwargs) -> list[int]:
        """
        Tokenize a string using the model's tokenizer and return a list of token IDs.
        """
        pass

    @abc.abstractmethod
-    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+    def _loglikelihood_tokens(
+        self, requests: list["Instance"], **kwargs
+    ) -> list[tuple[float, bool]]:
        pass

    def _encode_pair(
        self, context: str, continuation: str
-    ) -> Tuple[List[int], List[int]]:
+    ) -> tuple[list[int], list[int]]:
+        import transformers
+
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
@@ -364,8 +373,8 @@ class TemplateLM(LM):
        return context_enc, continuation_enc

    def loglikelihood(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
+        self, requests: list["Instance"], disable_tqdm: bool = False
+    ) -> list[tuple[float, bool]]:
        new_reqs = []
        for context, continuation in [req.args for req in requests]:
            if context == "":
@@ -384,11 +393,11 @@ class TemplateLM(LM):
    @abc.abstractmethod
    def loglikelihood_rolling(
        self, requests, disable_tqdm: bool = False
-    ) -> List[float]:
+    ) -> list[float]:
        pass

    @abc.abstractmethod
-    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
        pass

    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:

--- a/lm_eval/config/evaluate_config.py
+++ b/lm_eval/config/evaluate_config.py
@@ -63,9 +63,9 @@ class EvaluatorConfig:
    model_args: dict = field(
        default_factory=dict, metadata={"help": "Arguments for model initialization"}
    )
-    tasks: Union[str, list[str]] = field(
+    tasks: list[str] = field(
        default_factory=list,
-        metadata={"help": "Comma-separated list of task names to evaluate"},
+        metadata={"help": "List of task names to evaluate"},
    )

    # Few-shot and batching
@@ -212,9 +212,9 @@ class EvaluatorConfig:

        # Create instance and validate
        instance = cls(**config)
+        instance.configure()
        if used_config:
            print(textwrap.dedent(f"""{instance}"""))
-        instance.configure()

        return instance


--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
 import itertools
 import json
 import logging
+import os
 import random
 import time
 from collections import defaultdict
@@ -30,6 +31,7 @@ from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import (
    get_logger,
    handle_non_serializable,
+    hash_dict_images,
    hash_string,
    positional_deprecated,
    simple_parse_args_string,
@@ -140,7 +142,6 @@ def simple_evaluate(
        Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
    :param metadata: dict
        Additional metadata to be added to the task manager. Will get passed to the download function of the task.
-
    return
        Dictionary of results
    """
@@ -153,15 +154,23 @@ def simple_evaluate(
            "Either 'limit' or 'samples' must be None, but both are not None."
        )

+    _NEEDS_CHAT_TEMPLATE = ("inst", "chat")
    if (
-        (isinstance(model_args, str) and "inst" in model_args.lower())
+        (
+            isinstance(model_args, str)
+            and any(kw in model_args.lower() for kw in _NEEDS_CHAT_TEMPLATE)
+        )
        or (
            isinstance(model_args, dict)
-            and any("inst" in str(v).lower() for v in model_args.values())
+            and any(
+                any(kw in str(v).lower() for kw in _NEEDS_CHAT_TEMPLATE)
+                for v in model_args.values()
+            )
        )
    ) and not apply_chat_template:
        eval_logger.warning(
-            "Model appears to be an instruct variant but chat template is not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
+            "Model appears to be an instruct or chat variant but chat template is not applied. "
+            "Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
        )

    if delete_requests_cache:
@@ -745,6 +754,13 @@ def evaluate(
            },
        }
        if log_samples:
+            # default: hash images
+            samples = (
+                hash_dict_images(samples)
+                if os.environ.get("LMEVAL_HASHMM", "1") != "0"
+                and (hasattr(lm, "MULTIMODAL"))
+                else samples
+            )
            results_dict["samples"] = dict(samples)

        return results_dict

--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -141,7 +141,7 @@ class MultiChoiceRegexFilter(RegexFilter):
        """
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
-                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+                        - step 2 : We parse the choice with the regex `:[\\s]*([A-?])` (a colon, optional whitespace, then the choice letter), where ? varies by number of choices.
        group_select: Selects the (group_select)th match from the findall result.
        ignore_case: Ignores the case during step 1 matching
        ignore_punctuation: Remove the punctuation during step 1 matching

--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
@@ -10,7 +10,6 @@ from . import (
    ibm_watsonx_ai,
    mamba_lm,
    nemo_lm,
-    neuralmagic,
    neuron_optimum,
    openai_completions,
    optimum_ipex,

--- a/lm_eval/models/neuralmagic.py
+++ b/lm_eval/models/neuralmagic.py
-import copy
-import logging
-from typing import List, Optional, Tuple, Union
-
-import numpy
-import transformers
-from tqdm import tqdm
-
-import lm_eval.models.utils
-from lm_eval import utils
-from lm_eval.api.instance import Instance
-from lm_eval.api.model import LM
-from lm_eval.api.registry import register_model
-from lm_eval.models.huggingface import HFLM
-
-
-eval_logger = logging.getLogger(__name__)
-
-
-@register_model("sparseml")
-class SparseMLLM(HFLM):
-    """
-    SparseML is an open-source model optimization toolkit that enables you to create
-    inference-optimized sparse models using pruning, quantization, and distillation
-    algorithms. Models optimized with SparseML can then be exported to the ONNX format and
-    deployed with DeepSparse for GPU-class performance on CPU hardware.
-
-    This class is a wrapper around the HuggingFace LM class to enable SparseML
-    integration with the lm-evaluation-harness.
-    """
-
-    def _create_model(
-        self,
-        pretrained: str,
-        revision: Optional[str] = "main",
-        dtype: Optional[str] = "auto",
-        trust_remote_code: Optional[bool] = False,
-        **kwargs,
-    ) -> None:
-        try:
-            from sparseml.transformers import SparseAutoModelForCausalLM
-        except ModuleNotFoundError as exception:
-            raise type(exception)(
-                "Package `sparseml` is not installed. "
-                "Please install it via `pip install sparseml[transformers]`"
-            )
-
-        model_kwargs = kwargs if kwargs else {}
-
-        if "device_map" not in model_kwargs:
-            # set a device_map to initialize model on the right GPU.
-            # this is needed because it seems that the default behavior
-            # for quantized models now seems to be device_map="auto"
-            # which breaks data-parallel mode.
-            if hasattr(self, "accelerator"):
-                model_kwargs.update(
-                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
-                )
-            else:
-                model_kwargs.update({"device_map": {"": str(self.device)}})
-
-        relevant_kwarg_names = [
-            "offload_folder",
-            "device_map",
-        ]
-        relevant_kwargs = {
-            k: v for k, v in model_kwargs.items() if k in relevant_kwarg_names
-        }
-
-        # Log the difference between model_kwargs and relevant_kwargs so we can see
-        # what is being ignored
-        ignored_kwargs = {}
-        for k, v in model_kwargs.items():
-            if k not in relevant_kwargs.keys():
-                ignored_kwargs[k] = v
-        eval_logger.warning(
-            f"The sparseml integration is ignoring the following kwargs that are specified: {ignored_kwargs}"
-        )
-
-        model = SparseAutoModelForCausalLM.from_pretrained(
-            pretrained,
-            revision=revision,
-            torch_dtype=lm_eval.models.utils.get_dtype(dtype),
-            trust_remote_code=trust_remote_code,
-            **relevant_kwargs,
-        )
-        self._model = model
-
-    def _get_config(self, pretrained: str, **kwargs) -> None:
-        try:
-            from sparseml.transformers import SparseAutoConfig
-        except ModuleNotFoundError as exception:
-            raise type(exception)(
-                "Package `sparseml` is not installed. "
-                "Please install it via `pip install sparseml[transformers]`"
-            )
-
-        self._config = SparseAutoConfig.from_pretrained(
-            pretrained_model_name_or_path=pretrained, **kwargs
-        )
-
-    def _create_tokenizer(
-        self,
-        pretrained: Union[str, transformers.PreTrainedModel],
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ],
-        **kwargs,
-    ) -> None:
-        try:
-            from sparseml.transformers import SparseAutoTokenizer
-        except ModuleNotFoundError as exception:
-            raise type(exception)(
-                "Package `sparseml` is not installed. "
-                "Please install it via `pip install sparseml[transformers]`"
-            )
-
-        if tokenizer:
-            if isinstance(tokenizer, str):
-                self.tokenizer = SparseAutoTokenizer.from_pretrained(
-                    tokenizer,
-                    **kwargs,
-                )
-            else:
-                assert isinstance(
-                    tokenizer, transformers.PreTrainedTokenizer
-                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
-                self.tokenizer = tokenizer
-        else:
-            # Get tokenizer based on 'pretrained'
-            if isinstance(pretrained, str):
-                model_name = pretrained
-            else:
-                # get the HF hub name via accessor on model
-                model_name = self.model.name_or_path
-            self.tokenizer = SparseAutoTokenizer.from_pretrained(
-                model_name,
-                **kwargs,
-            )
-        return None
-
-
-@register_model("deepsparse")
-class DeepSparseLM(LM):
-    """
-    Wrapper around DeepSparse, a sparsity-aware deep learning
-    inference runtime for CPUs, to make it compatible with the
-    lm-evaluation-harness.
-    """
-
-    _DEFAULT_MAX_LENGTH = 2048
-
-    def __init__(
-        self,
-        pretrained: str,
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ] = None,
-        batch_size: Optional[Union[int, str]] = 1,
-        max_gen_toks: Optional[int] = 256,
-        max_length: Optional[int] = None,
-    ):
-        super().__init__()
-
-        try:
-            import deepsparse
-        except ModuleNotFoundError as exception:
-            raise type(exception)(
-                "Package `deepsparse` is not installed. "
-                "Please install it via `pip install deepsparse[transformers]`"
-            )
-
-        if isinstance(batch_size, str) and not batch_size.isdigit():
-            eval_logger.warning(
-                f"batch_size={batch_size} is not valid for deepsparse because it is not an integer. "
-                "Ignoring and using the default of 1."
-            )
-            batch_size = 1
-
-        self.batch_size = int(batch_size)
-        self._max_length = max_length if max_length else self._DEFAULT_MAX_LENGTH
-        self._max_gen_toks = max_gen_toks
-        self.batch_sizes = {}
-
-        # Initialize new model and tokenizer instances
-        self.model = deepsparse.TextGeneration(
-            model_path=pretrained,
-            sequence_length=self._max_length,
-            batch_size=batch_size,
-        )
-        self.tokenizer = tokenizer if tokenizer else self.model.tokenizer
-        self.config = self.model.config
-
-    def tok_encode(self, string: str) -> List[int]:
-        return self.tokenizer.encode(string)
-
-    def tok_decode(self, tokens: List[int]) -> str:
-        return self.tokenizer.decode(tokens)
-
-    @property
-    def eot_token_id(self):
-        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-        return self.tokenizer.eos_token_id
-
-    @property
-    def prefix_token_id(self):
-        # it is used as prefix for loglikelihood
-        if self.tokenizer.bos_token_id is not None:
-            return self.tokenizer.bos_token_id
-        return self.tokenizer.eos_token_id
-
-    @property
-    def max_length(self) -> int:
-        return self._max_length
-
-    @property
-    def max_gen_toks(self) -> int:
-        return self._max_gen_toks
-
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
-        """
-        Copied directly from
-        https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
-        """
-        new_reqs = []
-        for context, continuation in [req.args for req in requests]:
-            if context == "":
-                raise NotImplementedError(
-                    "Implementing empty context is not supported yet"
-                )
-            context_enc, continuation_enc = self._encode_pair(context, continuation)
-
-            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-
-        return self._loglikelihood_tokens(new_reqs)
-
-    def _loglikelihood_tokens(
-        self,
-        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
-        disable_tqdm: bool = False,
-    ) -> List[Tuple[float, bool]]:
-        """
-        The function to compute the loglikelihood of the continuation
-        tokens given the context tokens.
-
-        This function is an adapted version of the original function from
-        https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
-        """
-        res = []
-
-        def _collate(x):
-            """Defines the key for the sorted method"""
-            toks = x[1] + x[2]
-            return -len(toks), tuple(toks)
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        for chunk in tqdm(
-            list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
-            disable=disable_tqdm,
-        ):
-            batch_inp = []
-            batch_cache_key = []
-            batch_continuation_enc = []
-            # len(chunk) is the batch_size
-            for cache_key, context_enc, continuation_enc in chunk:
-                # how this all works (illustrated on a causal decoder-only setup):
-                #          CTX      CONT
-                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
-                # model  \               \
-                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
-                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501
-
-                inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
-
-                batch_inp.append(self.tokenizer.decode(inp))
-                batch_cache_key.append(cache_key)
-                batch_continuation_enc.append(continuation_enc)
-
-            response = self.model(
-                prompt=batch_inp,
-                max_new_tokens=0,
-                output_scores=True,
-                include_prompt_logits=True,
-            )
-
-            for resp, continuation_enc, cache_key in zip(
-                response.generations, batch_continuation_enc, batch_cache_key
-            ):
-                # (seq_len, vocab_size)
-                multi_scores = resp.score
-
-                from deepsparse.utils.data import numpy_log_softmax
-
-                # (seq_len, vocab_size) but with softmax applied
-                multi_logits = numpy_log_softmax(multi_scores, axis=1)
-                # toss out the context half of the sequence
-                # (cont_len, vocab_size)
-                continuation_multi_logits = multi_logits[-len(continuation_enc) :]
-
-                # pick out the logits for the continuation tokens
-                # (cont_len,)
-                continuation_logits = continuation_multi_logits[
-                    numpy.arange(len(continuation_enc)), continuation_enc
-                ]
-                # check if the tokens generated greedly are the same
-                # as the expected continuation
-                greedy_tokens = continuation_multi_logits.argmax(axis=1)
-                max_equal = greedy_tokens.tolist() == continuation_enc
-
-                # Answer: (log prob, is-exact-match)
-                answer = (float(continuation_logits.sum()), bool(max_equal))
-
-                res.append(answer)
-
-                if cache_key is not None:
-                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
-                    # all with cache key None. instead do add_partial on the per-example level
-                    # in the loglikelihood_rolling() function for those.
-                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
-
-        return re_ord.get_original(res)
-
-    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
-        raise NotImplementedError(
-            "The method not required by any of our current task integrations so far"
-        )
-
-    def generate_until(self, requests: List[Instance]) -> List[str]:
-        """
-        The function to generate a certain number of new tokens
-        given a context.
-
-        This function is an adapted version of the original function from
-        https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py
-        """
-        if not requests:
-            return []
-        res = []
-        requests = [req.args for req in requests]
-
-        def _collate(x):
-            toks = self.tok_encode(x[0])
-            return len(toks), x[0]
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        def sameuntil_chunks(xs, size):
-            ret = []
-            lastuntil = xs[0][1]
-            for x in xs:
-                if len(ret) >= size or x[1] != lastuntil:
-                    yield ret, lastuntil
-                    ret = []
-                    lastuntil = x[1]
-                ret.append(x)
-
-            if ret:
-                yield ret, lastuntil
-
-        pbar = tqdm(total=len(requests))
-        for chunk, request_args in tqdm(
-            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
-        ):
-            inps = []
-
-            # make a deepcopy since we are changing arguments
-            request_args = copy.deepcopy(request_args)
-
-            self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
-
-            for context, _ in chunk:
-                # add context (prompts) to the list
-                inps.append(context)
-
-            until = request_args.pop("until", ["<|endoftext|>"])
-            request_args.pop("do_sample", None)
-            request_args["temperature"] = request_args.get("temperature", 0)
-
-            # run inference (generate max_gen_toks tokens)
-            out = self.model(
-                sequences=inps,
-                max_new_tokens=self.max_gen_toks - 1,
-                stop=until,
-                **request_args,
-            )
-
-            for resp, (context, args_) in zip(out.generations, chunk):
-                text = resp.text
-                until_ = until
-                # split the text at the first occurrence of any of the until tokens
-                for term in until_:
-                    if len(term) > 0:
-                        text = text.split(term)[0]
-
-                res.append(text)
-
-                self.cache_hook.add_partial(
-                    "generate_until", (context, {"until": until_}), text
-                )
-                pbar.update(1)
-
-        pbar.close()
-
-        return re_ord.get_original(res)
-
-    def _encode_pair(
-        self, context: str, continuation: str
-    ) -> Tuple[List[int], List[int]]:
-        """
-        Copied directly from
-        https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
-        """
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-        whole_enc = self.tok_encode(context + continuation)
-        context_enc = self.tok_encode(context)
-        context_enc_len = len(context_enc)
-        continuation_enc = whole_enc[context_enc_len:]
-        return context_enc, continuation_enc
--- a/lm_eval/tasks/evalita_llm/_evalita-mp_ner_adg.yaml
+++ b/lm_eval/tasks/evalita_llm/_evalita-mp_ner_adg.yaml
--- a/lm_eval/tasks/evalita_llm/_evalita-mp_ner_fic.yaml
+++ b/lm_eval/tasks/evalita_llm/_evalita-mp_ner_fic.yaml
--- a/lm_eval/tasks/evalita_llm/_evalita-mp_ner_wn.yaml
+++ b/lm_eval/tasks/evalita_llm/_evalita-mp_ner_wn.yaml
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -613,3 +613,59 @@ def weighted_f1_score(items):
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
+
+
+def convert_pil_to_hash(value):
+    """
+    Return the SHA-256 hex digest of `value` encoded as PNG.
+
+    `value` must provide `save(fp, format=...)` (as `PIL.Image.Image` does);
+    the digest is computed over the bytes written by that call.
+    """
+    from io import BytesIO
+
+    img_bytes = BytesIO()
+    value.save(img_bytes, format="PNG")
+    # Hash the buffer's contents: str(img_bytes) is only the object's repr,
+    # which says nothing about the image data.
+    return hashlib.sha256(img_bytes.getvalue()).hexdigest()
+
+
+def convert_bytes_to_hash(value):
+    """Return the SHA-256 hex digest of the encoded `str()` form of `value`."""
+    # NOTE: the digest covers the textual form (e.g. "b'...'"), not the raw bytes.
+    encoded_repr = str(value).encode()
+    digest = hashlib.sha256(encoded_repr)
+    return digest.hexdigest()
+
+
+def hash_dict_images(data_dict):
+    """
+    Return a copy of `data_dict` with every bytes-like or PIL image value hashed.
+
+    Nested dicts, lists and tuples are rebuilt recursively (tuples stay tuples);
+    `bytes`/`bytearray` values go through `convert_bytes_to_hash` and
+    `PIL.Image.Image` values through `convert_pil_to_hash`. Any other value is
+    placed in the result as-is (the same object, not a copy), and dictionary
+    keys are never modified.
+
+    Parameters:
+        data_dict (dict): The input dictionary with arbitrary nesting of dicts,
+            lists and tuples.
+
+    Returns:
+        dict: A new dictionary with the same structure as `data_dict`, or
+              `data_dict` itself (unmodified) when PIL is not installed.
+
+    Raises:
+        TypeError: If `data_dict` is not a dictionary.
+    """
+    # Local import so the availability check does not depend on the module
+    # having imported the `importlib.util` submodule elsewhere.
+    import importlib.util
+
+    # Ensure the top-level is a dict
+    if not isinstance(data_dict, dict):
+        raise TypeError("Input must be a dictionary")
+
+    # Without PIL, hand the input back untouched (bytes are not hashed either).
+    if importlib.util.find_spec("PIL") is None:
+        return data_dict
+
+    # Imported once here rather than on every recursive call.
+    from PIL import Image
+
+    def _process_value(value):
+        # Bytes -> hash
+        if isinstance(value, (bytes, bytearray)):
+            return convert_bytes_to_hash(value)
+        # PIL Image -> hash
+        if isinstance(value, Image.Image):
+            return convert_pil_to_hash(value)
+        # Nested dictionary -> recurse
+        if isinstance(value, dict):
+            return {k: _process_value(v) for k, v in value.items()}
+        # List or tuple -> recurse, preserving type
+        if isinstance(value, list):
+            return [_process_value(v) for v in value]
+        if isinstance(value, tuple):
+            return tuple(_process_value(v) for v in value)
+        # Other types remain unchanged
+        return value
+
+    return {key: _process_value(val) for key, val in data_dict.items()}